In [1327]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

import datetime
from datetime import timedelta
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.nonparametric.smoothers_lowess import lowess

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV


from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.cross_decomposition import PLSRegression
from sklearn import ensemble
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB


from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error


import itertools

import graphviz

import copy
In [2]:
pd.options.display.max_columns = None
In [3]:
nfl = pd.read_csv('train.csv')
d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\IPython\core\interactiveshell.py:3063: DtypeWarning: Columns (47) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [4]:
nfl.head()
Out[4]:
GameId PlayId Team X Y S A Dis Orientation Dir NflId DisplayName JerseyNumber Season YardLine Quarter GameClock PossessionTeam Down Distance FieldPosition HomeScoreBeforePlay VisitorScoreBeforePlay NflIdRusher OffenseFormation OffensePersonnel DefendersInTheBox DefensePersonnel PlayDirection TimeHandoff TimeSnap Yards PlayerHeight PlayerWeight PlayerBirthDate PlayerCollegeName Position HomeTeamAbbr VisitorTeamAbbr Week Stadium Location StadiumType Turf GameWeather Temperature Humidity WindSpeed WindDirection
0 2017090700 20170907000118 away 73.91 34.84 1.69 1.13 0.40 81.99 177.18 496723 Eric Berry 29 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 6-0 212 12/29/1988 Tennessee SS NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW
1 2017090700 20170907000118 away 74.67 32.64 0.42 1.35 0.01 27.61 198.70 2495116 Allen Bailey 97 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 6-3 288 03/25/1989 Miami DE NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW
2 2017090700 20170907000118 away 74.00 33.20 1.22 0.59 0.31 3.01 202.73 2495493 Justin Houston 50 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 6-3 270 01/21/1989 Georgia DE NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW
3 2017090700 20170907000118 away 71.46 27.70 0.42 0.54 0.02 359.77 105.64 2506353 Derrick Johnson 56 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 6-3 245 11/22/1982 Texas ILB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW
4 2017090700 20170907000118 away 69.32 35.42 1.82 2.43 0.16 12.63 164.31 2530794 Ron Parker 38 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 6-0 206 08/17/1987 Newberry FS NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW

Feature Engineering to select predictor set

In [5]:
# Query only data from Runners
runners = nfl.query('NflId == NflIdRusher')
runners = runners.reset_index(drop=True)
runners_featured = pd.DataFrame()
In [6]:
# Convert Runners Home/Away to Binary (home=1, away=0)
runners_featured['Team_Home'] = runners.Team.map(dict(home=1, away=0))
In [7]:
runners_featured['Dis'] = runners['Dis']
In [8]:
# Refactor Runner names to be a quantitative value (yards/game)

runners_df = pd.DataFrame({'DisplayName':runners['DisplayName'].unique()})
for player in runners['DisplayName'].unique():
    id = runners_df.index[runners_df['DisplayName']==player]
    strng = player
    runners_df.loc[id, 'sum_yards'] = runners.query("DisplayName == @strng")['Yards'].sum()
    runners_df.loc[id, 'games'] = runners.query("DisplayName == @strng")['GameId'].nunique()

# Calculate ypg for each runner
runners_df['ypg'] = runners_df.apply(lambda x: x['sum_yards']/x['games'] ,axis=1)
# Merge the runners df with the running plays dataframe to define Player using their ypg
runners = runners.merge(runners_df,'left',on='DisplayName')
In [9]:
runners.head()
Out[9]:
GameId PlayId Team X Y S A Dis Orientation Dir NflId DisplayName JerseyNumber Season YardLine Quarter GameClock PossessionTeam Down Distance FieldPosition HomeScoreBeforePlay VisitorScoreBeforePlay NflIdRusher OffenseFormation OffensePersonnel DefendersInTheBox DefensePersonnel PlayDirection TimeHandoff TimeSnap Yards PlayerHeight PlayerWeight PlayerBirthDate PlayerCollegeName Position HomeTeamAbbr VisitorTeamAbbr Week Stadium Location StadiumType Turf GameWeather Temperature Humidity WindSpeed WindDirection sum_yards games ypg
0 2017090700 20170907000118 home 78.75 30.53 3.63 3.35 0.38 161.98 245.74 2543773 James White 28 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 5-10 205 02/03/1992 Wisconsin RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 716.0 38.0 18.842105
1 2017090700 20170907000139 home 71.07 27.16 3.06 2.41 0.34 210.70 312.20 2543773 James White 28 2017 43 1 13:52:00 NE 1 10 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:27.000Z 2017-09-08T00:44:26.000Z 3 5-10 205 02/03/1992 Wisconsin RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 716.0 38.0 18.842105
2 2017090700 20170907000189 home 48.66 19.11 5.77 2.42 0.60 140.82 221.96 2543773 James White 28 2017 35 1 13:02:00 NE 1 10 KC 0 0 2543773 SINGLEBACK 1 RB, 1 TE, 3 WR 7.0 2 DL, 3 LB, 6 DB left 2017-09-08T00:45:17.000Z 2017-09-08T00:45:15.000Z 5 5-10 205 02/03/1992 Wisconsin RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 716.0 38.0 18.842105
3 2017090700 20170907000345 home 15.53 25.36 4.45 3.20 0.46 186.22 275.44 2539663 Mike Gillislee 35 2017 2 1 12:12:00 NE 2 2 KC 0 0 2539663 JUMBO 6 OL, 2 RB, 2 TE, 0 WR 9.0 4 DL, 4 LB, 3 DB left 2017-09-08T00:48:41.000Z 2017-09-08T00:48:39.000Z 2 5-11 210 11/01/1990 Florida RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 426.0 13.0 32.769231
4 2017090700 20170907000395 away 29.99 27.12 3.90 2.53 0.44 34.27 157.92 2557917 Kareem Hunt 27 2017 25 1 12:08:00 KC 1 10 KC 7 0 2557917 SHOTGUN 1 RB, 3 TE, 1 WR 7.0 3 DL, 2 LB, 6 DB right 2017-09-08T00:53:14.000Z 2017-09-08T00:53:13.000Z 7 5-11 216 08/06/1995 Toledo RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 2207.0 30.0 73.566667
In [10]:
runners_featured['ypg'] = runners['ypg']
In [11]:
# Determine Yard line (0: own end line, 100: touchdown line)
runners['YardLine_refactor'] = list(runners.apply(lambda x: x['YardLine'] if (x['PossessionTeam'] == x['FieldPosition']) else 100 - x['YardLine'] , axis=1))
runners_featured['YardLine_refactor'] = list(runners['YardLine_refactor'])
In [12]:
# Calculate Time Elapsed in seconds
runners['GameClock_timedata'] = np.array(pd.to_timedelta('00:' + runners['GameClock'].astype(str).str[:-3]))
runners['TimeElapsed'] = list(runners.apply(lambda x: (x['Quarter']-1)*pd.Timedelta(minutes=15) + pd.Timedelta(minutes=15) - x['GameClock_timedata'] , axis=1))
runners['TimeElapsed'] = runners['TimeElapsed'].dt.seconds
runners_featured['TimeElapsed'] = runners['TimeElapsed']
In [13]:
# Refactor Possession Team and Defense Team based on total running yards allowed
runners['DefenseTeam'] = list(runners.apply(lambda x: x['HomeTeamAbbr'] if (x['PossessionTeam'] == x['VisitorTeamAbbr']) else x['VisitorTeamAbbr'] , axis=1))
In [14]:
# Gather Total Rushing Data for each team
team_rushing_df = pd.DataFrame({'PossessionTeam':runners['PossessionTeam'].unique()})
for team in runners['PossessionTeam'].unique():
    id = team_rushing_df.index[team_rushing_df['PossessionTeam']==team]
    strng = team
    team_rushing_df.loc[id, 'team_rushing_yards'] = runners.query("PossessionTeam == @strng")['Yards'].sum()
In [15]:
# Gather Total Defensive Rushing yards allowed for each team
team_defense_df = pd.DataFrame({'DefenseTeam':runners['DefenseTeam'].unique()})
for team in runners['DefenseTeam'].unique():
    id = team_defense_df.index[team_defense_df['DefenseTeam']==team]
    strng = team
    team_defense_df.loc[id, 'team_defense_rushing_yards'] = runners.query("DefenseTeam == @strng")['Yards'].sum()
In [16]:
# Merge the team total rushing df and defense df with the running plays dataframe to define Possession Team and Defensive teams using their rushing yards and rushing yards allowed
runners = runners.merge(team_rushing_df,'left',on='PossessionTeam')
runners = runners.merge(team_defense_df,'left',on='DefenseTeam')

runners_featured['team_rushing_yards'] = runners['team_rushing_yards']
runners_featured['team_defense_rushing_yards'] = runners['team_defense_rushing_yards']
In [17]:
# Down and Distance
# TODO: probably should consider combining these
runners_featured['Down'] = runners['Down']
runners_featured['Distance'] = runners['Distance']
In [18]:
# Convert Home and Away Score to Offense/ Defense Score
runners['PossessionScore'] = list(runners.apply(lambda x: x['HomeScoreBeforePlay'] if (x['Team'] == 'home') else x['VisitorScoreBeforePlay'] , axis=1))
runners['DefenseScore'] = list(runners.apply(lambda x: x['HomeScoreBeforePlay'] if (x['Team'] == 'away') else x['VisitorScoreBeforePlay'] , axis=1))
runners['ScoreDifferential'] = runners['PossessionScore'] - runners['DefenseScore']
# TODO: look into combine with time remaining

runners_featured['PossessionScore'] = runners['PossessionScore']
runners_featured['DefenseScore'] = runners['DefenseScore']
runners_featured['ScoreDifferential'] = runners['ScoreDifferential']
In [19]:
# Offense Personel
# OL: if nan assumed to be 5 (typical personel)
runners['OL'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sOL+')
runners['OL'] = runners['OL'].fillna(5)
runners['TE'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sTE+')
runners['WR'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sWR+')
runners['RB'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sRB+')

runners_featured['OL'] = runners['OL']
runners_featured['TE'] = runners['TE']
runners_featured['WR'] = runners['WR']
runners_featured['RB'] = runners['RB']
In [20]:
# Offense Formation
# Determine if Shotgun/Pistol, Wildcat or under center
runners['Shotgun'] = list(runners.apply(lambda x: 1 if (x['OffenseFormation'] == 'SHOTGUN' or x['OffenseFormation'] == 'PISTOL') else 0, axis=1))
runners['Wildcat'] = list(runners.apply(lambda x: 1 if (x['OffenseFormation'] == 'WILDCAT') else 0, axis=1))

runners_featured['Shotgun'] = runners['Shotgun']
runners_featured['Wildcat'] = runners['Wildcat']
In [22]:
# Defense Personel
runners['DL'] = runners['DefensePersonnel'].str.extract(r'.*([0-9])\sDL+')
runners['LB'] = runners['DefensePersonnel'].str.extract(r'.*([0-9])\sLB+')
runners['DB'] = runners['DefensePersonnel'].str.extract(r'.*([0-9])\sDB+')

runners_featured['DL'] = runners['DL']
runners_featured['LB'] = runners['LB']
runners_featured['DB'] = runners['DB']
In [23]:
# Defenders in the box
# One NaN value re-factored to be DL + LB
runners.DefendersInTheBox.fillna(runners.DL + runners.LB, inplace=True)
runners_featured['DefendersInTheBox'] = runners['DefendersInTheBox']
In [24]:
# Play Direction
runners['PlayDirection_Left'] = runners.PlayDirection.map(dict(left=1, right=0))
runners_featured['PlayDirection_Left'] = runners['PlayDirection_Left']
In [25]:
# Runner Height
runners['PlayerHeight_inches'] = np.float_(runners['PlayerHeight'].str.extract(r'(\d+)-(\d+)')[0]) * 12  + np.float_(runners['PlayerHeight'].str.extract(r'(\d+)-(\d+)')[1])
runners_featured['PlayerHeight_inches'] = runners['PlayerHeight_inches']
In [26]:
# Runner Weight
runners_featured['PlayerWeight'] = runners['PlayerWeight']
In [27]:
# Runner Age: time at snap - birth date
runners['Age_days'] = pd.to_datetime(runners['TimeSnap'].astype(str).str[:10]) - pd.to_datetime(runners['PlayerBirthDate'])
runners['Age_days'] = runners['Age_days'].dt.days
runners_featured['Age_days'] = runners['Age_days']
In [28]:
# Position
runners['Runner_RB'] = list(runners.apply(lambda x: 1 if x['Position'] == 'RB' else 0, axis=1))
runners['Runner_WR'] = list(runners.apply(lambda x: 1 if x['Position'] == 'WR' else 0, axis=1))
runners['Runner_FB'] = list(runners.apply(lambda x: 1 if x['Position'] == 'FB' else 0, axis=1))
runners['Runner_HB'] = list(runners.apply(lambda x: 1 if x['Position'] == 'HB' else 0, axis=1))
runners['Runner_QB'] = list(runners.apply(lambda x: 1 if x['Position'] == 'QB' else 0, axis=1))

runners_featured['Runner_RB'] = runners['Runner_RB']
runners_featured['Runner_WR'] = runners['Runner_WR']
runners_featured['Runner_FB'] = runners['Runner_FB']
runners_featured['Runner_HB'] = runners['Runner_HB']
runners_featured['Runner_QB'] = runners['Runner_QB']
In [29]:
# Week
runners_featured['Week'] = runners['Week']
In [30]:
# Stadium
outdoors_names = ['Outdoor', 'Outdoors', 'Outddors', 'Oudoor', 'Ourdoor', 'Heinz Field', 'Outdor', 'Cloudy', 'Bowl', 'Outside', 'OUTDOOR']
indoor_names = ['Indoors', 'Indoor', 'Retr. Roof-Closed', 'Retr. Roof - Closed','Dome', 'Domed, closed','Indoor, Roof Closed', 'Retr. Roof Closed','Closed Dome','Dome, closed','Domed', 'Indoor','indoor','Retractable Roof - Closed']
retractable_open_names = ['Retractable Roof', 'Open', 'Indoor, Open Roof','Outdoor Retr Roof-Open','Retr. Roof-Open','Domed, Open', 'Domed, open','Indoor, roof open',]

runners['Outdoors'] = list(runners.apply(lambda x: 1 if x['StadiumType'] in outdoors_names else 0, axis=1))
runners['Indoors'] = list(runners.apply(lambda x: 1 if x['StadiumType'] in indoor_names else 0, axis=1))
runners['Retractable_open'] = list(runners.apply(lambda x: 1 if x['StadiumType'] in retractable_open_names else 0, axis=1))

runners_featured['Outdoors'] = runners['Outdoors']
runners_featured['Indoors'] = runners['Indoors']
runners_featured['Retractable_open'] = runners['Retractable_open']
In [31]:
# Turf
grass_names = ['Grass', 'Natural Grass','Natural grass', 'grass', 'Natural','Naturall Grass','natural grass']
runners['Natural_Grass'] = list(runners.apply(lambda x: 1 if x['Turf'] in grass_names else 0, axis=1))
runners_featured['Natural_Grass'] = runners['Natural_Grass']
In [32]:
# Weather: rainy/snowy weather and cold weather
rainy_weather = ['Light Rain', 'Showers','Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Rain','Cloudy, fog started developing in 2nd quarter',
       'Rain likely, temps in low 40s.','Cloudy, light snow accumulating 1-3"','Heavy lake effect snow', 'Snow','Scattered Showers',
                'Cloudy, 50% change of rain','Cloudy, Rain', 'Rain shower','Rainy','Light rain','Cloudy with showers and wind','Raining', 'Rain and Wind']

runners['rainy_weather'] = list(runners.apply(lambda x: 1 if x['GameWeather'] in rainy_weather else 0, axis=1))

runners_featured['rainy_weather'] = runners['rainy_weather']
In [33]:
# Refactor NaNs in temperature and humidity to be stadiums average temp/ humidity
stadium_df = pd.DataFrame({'Stadium':runners[runners['Temperature'].isnull()]['Stadium'].unique()})
for stadium in runners[runners['Temperature'].isnull()]['Stadium'].unique():
    id = stadium_df.index[stadium_df['Stadium']==stadium]
    strng = stadium
    stadium_df.loc[id, 'average_temperature'] = runners.query("Stadium == @strng")['Temperature'].mean()
    stadium_df.loc[id, 'average_humidity'] = runners.query("Stadium == @strng")['Humidity'].mean()

runners = runners.merge(stadium_df,'left',on='Stadium')

runners.Temperature.fillna(runners.average_temperature, inplace=True)
runners.Humidity.fillna(runners.average_humidity, inplace=True)

runners_featured['Humidity'] = runners['Humidity']
runners_featured['Temperature'] = runners['Temperature']
# runners_featured['WindSpeed'] = runners['WindSpeed']

Scale Predictors

In [34]:
bool_cols = [col for col in runners_featured 
             if np.isin(runners_featured[col].dropna().unique(), [0, 1]).all()]
continuous_cols = [col for col in runners_featured 
             if not np.isin(runners_featured[col].dropna().unique(), [0, 1]).all()]
In [35]:
runners_featured_bool = runners_featured[bool_cols]
In [36]:
runners_featured_cont = runners_featured[continuous_cols]
In [37]:
scaler = StandardScaler()
scaler.fit(runners_featured_cont)
runners_featured_cont_scaled = scaler.transform(runners_featured_cont)
runners_featured_cont_scaled_df = pd.DataFrame(runners_featured_cont_scaled, index=runners_featured_cont.index, columns=runners_featured_cont.columns)
In [38]:
runners_scaled_df = runners_featured_bool.merge(runners_featured_cont_scaled_df, left_index=True, right_index=True)

Data Visualization

In [39]:
fig = px.scatter_matrix(runners_featured.iloc[:,:10])
fig.show()
In [41]:
fig_4 = px.histogram( runners[['Yards']])
fig_4.show()

This histogram of all the different rushing yards shows that it is somewhat right skewed and we note that in the models and when running diagnostics.

Correlation Matrix

In [43]:
runners_scaled_response_df = runners_scaled_df.merge(runners['Yards'], left_index=True, right_index=True)
In [44]:
correlation_cols = runners_scaled_response_df.columns
corr = runners_scaled_response_df[correlation_cols].corr()
corr.style.background_gradient(cmap='coolwarm')
Out[44]:
Team_Home Shotgun Wildcat PlayDirection_Left Runner_RB Runner_WR Runner_FB Runner_HB Runner_QB Outdoors Indoors Retractable_open Natural_Grass rainy_weather Dis ypg YardLine_refactor TimeElapsed team_rushing_yards team_defense_rushing_yards Down Distance PossessionScore DefenseScore ScoreDifferential OL TE WR RB DL LB DB DefendersInTheBox PlayerHeight_inches PlayerWeight Age_days Week Humidity Temperature Yards
Team_Home 1 -0.00341153 -0.00749331 0.00157337 0.00645546 -0.00321319 -0.00493891 -0.00147214 -0.0067029 -0.00652207 0.00835539 0.0100885 -0.0128797 -0.00150023 0.00532039 0.00197373 0.0162591 0.000848083 0.00259465 -0.1326 0.000361328 -0.00277397 0.0597553 -0.0713706 0.118498 -0.00815497 0.0047435 -0.00718759 0.0131403 0.00203846 0.00481745 -0.00708466 0.00769597 -0.00176908 0.00201242 0.00243805 0.00840256 -0.0232389 0.000470429 0.00712216
Shotgun -0.00341153 1 -0.0353638 -0.00952423 0.0206384 -0.0516742 -0.0249678 0.0271001 0.0277453 0.03776 -0.0620468 0.0177511 0.0512573 0.00341703 -0.356689 -0.0452128 -0.00653286 0.00701729 -0.0638755 0.0425688 0.0900239 0.0678293 -0.0348381 0.0615374 -0.0860764 -0.128274 -0.305263 0.412041 -0.214916 -0.170177 -0.143377 0.375441 -0.388805 -0.035411 -0.016035 -0.0473321 -0.0107138 0.0499904 -0.0118665 0.0281184
Wildcat -0.00749331 -0.0353638 1 -0.00372767 -0.0147158 0.0158516 -0.00411024 -0.00111373 0.0316344 -0.00743554 -0.000346249 0.020094 -0.00728078 0.00666319 -0.020127 -0.00855981 0.020427 0.000700311 0.0207578 0.000833483 0.0202763 -0.012824 -0.00899614 0.0068422 -0.0145125 -0.00378182 -0.0226103 -0.0224333 0.0104475 -0.00607437 -0.0077001 0.0166017 -0.0136442 -0.0128484 -0.0112693 -0.00578191 0.00432249 0.0122008 -0.00177049 0.00346331
PlayDirection_Left 0.00157337 -0.00952423 -0.00372767 1 -0.00642313 0.0143858 -0.00356424 -0.00266064 0.00116202 -0.000415712 -0.00209498 0.00594095 -0.0052273 -0.00242505 0.0188323 -0.00190248 -0.0026083 -0.00848466 0.000521269 0.0014761 -0.00284757 -0.00976936 -0.00730163 -0.0069499 -0.00111843 0.0110314 0.0090295 -0.0100106 -0.00304476 0.00622715 -0.00114103 -0.00523611 0.00579607 -0.00331484 -0.00541411 -0.00406912 0.0113771 -0.0010606 -0.0151276 0.00638574
Runner_RB 0.00645546 0.0206384 -0.0147158 -0.00642313 1 -0.611274 -0.304008 -0.646408 -0.20886 -0.0528295 0.0311376 0.0341321 0.0498665 0.0156188 -0.172022 0.283521 -0.00794203 0.0193636 0.125993 0.000976624 -0.014327 0.0106698 0.0270744 -0.0109976 0.0354957 0.0177491 0.0169172 -0.0543958 0.0701653 0.0103925 0.013909 -0.0287042 0.0308935 -0.0973209 0.118113 0.0369576 -0.0174079 -0.0156432 0.020272 -0.0197326
Runner_WR -0.00321319 -0.0516742 0.0158516 0.0143858 -0.611274 1 -0.013645 -0.0290132 -0.00937443 0.0133077 -0.00632377 -0.0158014 -0.00279277 -0.0108617 0.317397 -0.297926 0.00496401 -0.037085 0.0325508 -0.0188538 -0.0201954 0.015126 -0.0172585 -0.0242332 0.00401193 -0.0268286 -0.0362206 0.0831503 -0.0838221 0.0101626 -0.0456454 0.0433673 -0.0335469 0.00368806 -0.275713 0.0277457 -0.00247579 0.00305823 -0.00188456 0.0482147
Runner_FB -0.00493891 -0.0249678 -0.00411024 -0.00356424 -0.304008 -0.013645 1 -0.0144293 -0.00466223 0.00413882 0.00565073 -0.0125428 -0.0179101 0.0176281 -0.0322455 -0.14546 0.0197126 0.0100858 0.0151859 -0.0112131 0.0616912 -0.0459006 0.00324248 -0.00518451 0.00754845 0.0324264 0.015037 -0.0671153 0.0866657 0.0189803 0.0391757 -0.070833 0.054647 0.0111537 0.161166 0.0875658 0.0122184 -0.00257052 -0.0125409 -0.0150071
Runner_HB -0.00147214 0.0271001 -0.00111373 -0.00266064 -0.646408 -0.0290132 -0.0144293 1 -0.00991323 0.0632323 -0.0444947 -0.0267345 -0.0624011 -0.0197113 -0.00885453 -0.00659766 -0.00765926 0.00238722 -0.230702 0.0219427 -0.00180655 0.00300264 -0.0248978 0.0477899 -0.0647655 -0.0139515 -0.0120879 0.0437906 -0.0549868 -0.0393418 0.00528049 0.0398134 -0.0460815 0.075049 -0.0230671 -0.131825 0.0216787 0.020979 -0.0231456 -0.00529897
Runner_QB -0.0067029 0.0277453 0.0316344 0.00116202 -0.20886 -0.00937443 -0.00466223 -0.00991323 1 0.00452035 0.0049401 -0.00950322 0.00143293 -0.00205357 -0.0927651 -0.117624 0.0073269 0.00237402 0.0107059 -0.00414528 0.0141874 -0.012814 0.00294967 -0.0111759 0.0123743 -0.0110324 -0.00428908 0.0123747 -0.0193461 -0.00223768 -0.00883362 0.0134294 -0.00922663 0.112233 0.0240982 0.0267159 -0.00674732 -0.00385267 0.0120687 -0.0136083
Outdoors -0.00652207 0.03776 -0.00743554 -0.000415712 -0.0528295 0.0133077 0.00413882 0.0632323 0.00452035 1 -0.723178 -0.368592 0.429274 0.113721 -0.00834346 -0.0310499 0.00293028 0.000917795 -0.00971059 0.00267464 0.00520191 -0.00131068 -0.0141634 -0.00104975 -0.0127686 -0.00673531 -0.00316586 0.006971 -0.0087456 -0.0684959 0.0640662 0.00334944 -0.0182991 -0.0314935 -0.00794552 0.0205515 0.01397 0.312905 -0.206623 0.00644375
Indoors 0.00835539 -0.0620468 -0.000346249 -0.00209498 0.0311376 -0.00632377 0.00565073 -0.0444947 0.0049401 -0.723178 1 -0.12549 -0.401119 -0.11914 0.0349254 0.0191923 0.00683267 -0.00381464 0.0209837 -0.0468693 -0.00189136 0.000633449 0.0110275 0.0103556 0.00180908 0.00244387 0.00494756 -0.0086941 0.00811679 0.0885025 -0.0698139 -0.0206371 0.0264961 -0.0144472 -0.0234314 -0.00279715 -0.00672273 -0.398709 0.12632 -0.00485295
Retractable_open 0.0100885 0.0177511 0.020094 0.00594095 0.0341321 -0.0158014 -0.0125428 -0.0267345 -0.00950322 -0.368592 -0.12549 1 -0.13015 -0.0522747 -0.0260629 0.0437359 -0.0205889 -0.00288711 -0.00450725 0.0386805 -0.0078597 -0.00241257 -0.00405114 -0.0152488 0.00909313 0.0226695 -0.00586478 -0.00631822 0.0127841 -0.0175827 0.0338212 -0.019689 0.0226489 0.0460468 0.0532846 0.0135183 -0.0156466 0.0107649 0.154285 -0.0136795
Natural_Grass -0.0128797 0.0512573 -0.00728078 -0.0052273 0.0498665 -0.00279277 -0.0179101 -0.0624011 0.00143293 0.429274 -0.401119 -0.13015 1 -0.0623343 -0.0178597 0.00098254 -0.00975448 0.00412681 0.00462543 0.0882189 -0.00198169 0.00996845 -0.0168874 -0.0146266 -0.00382067 -0.0256133 0.017158 0.000483546 -0.0121271 -0.0675123 0.0716298 -0.00692132 -0.0111136 0.0498165 0.0427893 -0.00741878 -0.00805066 -0.028204 0.155423 -0.00014642
rainy_weather -0.00150023 0.00341703 0.00666319 -0.00242505 0.0156188 -0.0108617 0.0176281 -0.0197113 -0.00205357 0.113721 -0.11914 -0.0522747 -0.0623343 1 -0.0268526 0.00319347 -0.00509531 0.00261859 0.0129912 0.0499115 0.00198092 0.0114542 -0.028428 -0.0373883 0.00445278 0.0188663 0.0139808 -0.0126869 -0.0128311 -0.035616 0.0350712 0.000258618 0.0149243 -0.0145416 0.0254921 0.0217452 -0.00838878 0.349886 -0.147375 -0.00588561
Dis 0.00532039 -0.356689 -0.020127 0.0188323 -0.172022 0.317397 -0.0322455 -0.00885453 -0.0927651 -0.00834346 0.0349254 -0.0260629 -0.0178597 -0.0268526 1 -0.0525512 0.00584519 -0.0413023 0.0976713 -0.0375052 -0.0651555 -0.0178841 -0.00511064 -0.0459467 0.0342444 -0.00167892 0.0737655 -0.102119 0.0764039 0.0611269 0.0216906 -0.0987371 0.0877398 -0.0236384 -0.143347 0.0185569 -0.00100065 -0.040752 0.0145079 0.0519313
ypg 0.00197373 -0.0452128 -0.00855981 -0.00190248 0.283521 -0.297926 -0.14546 -0.00659766 -0.117624 -0.0310499 0.0191923 0.0437359 0.00098254 0.00319347 -0.0525512 1 -0.0255412 -0.0489945 0.165125 -0.0085553 -0.0465834 -0.0082103 -0.0374414 -0.040623 -0.00148469 0.00320314 0.0279883 -0.0423608 0.0428936 -0.00531003 0.0519233 -0.0564161 0.0450868 0.181879 0.278195 -0.163255 -0.0573048 -0.0505708 0.0289511 0.0277638
YardLine_refactor 0.0162591 -0.00653286 0.020427 -0.0026083 -0.00794203 0.00496401 0.0197126 -0.00765926 0.0073269 0.00293028 0.00683267 -0.0205889 -0.00975448 -0.00509531 0.00584519 -0.0255412 1 0.0831257 0.0614055 0.00901266 0.0921483 -0.283417 0.0271057 0.0102749 0.0173889 0.0765109 0.0335952 -0.0737694 0.0216914 0.0687065 0.00400727 -0.0879869 0.162531 0.00130746 -0.00189203 -0.0182417 -0.00113208 0.00583143 -0.0154099 -0.0945703
TimeElapsed 0.000848083 0.00701729 0.000700311 -0.00848466 0.0193636 -0.037085 0.0100858 0.00238722 0.00237402 0.000917795 -0.00381464 -0.00288711 0.00412681 0.00261859 -0.0413023 -0.0489945 0.0831257 1 0.0421813 0.0209567 0.0625062 -0.0201877 0.732121 0.627461 0.171308 0.0277457 0.0355594 -0.03709 -0.00614516 0.00644398 0.00222673 -0.0111334 0.0538473 0.0219289 0.0123897 -0.0224977 -0.0102956 -0.0084899 0.00417279 -0.00881821
team_rushing_yards 0.00259465 -0.0638755 0.0207578 0.000521269 0.125993 0.0325508 0.0151859 -0.230702 0.0107059 -0.00971059 0.0209837 -0.00450725 0.00462543 0.0129912 0.0976713 0.165125 0.0614055 0.0421813 1 0.0121524 0.020906 -0.0372798 0.129637 -0.0546376 0.171647 -0.0137127 -0.0236307 -0.0507719 0.119384 -0.00873156 0.0457337 -0.0443953 0.04178 -0.0256887 -0.0160724 -0.047541 -0.013461 -0.051114 0.00750955 0.0340664
team_defense_rushing_yards -0.1326 0.0425688 0.000833483 0.0014761 0.000976624 -0.0188538 -0.0112131 0.0219427 -0.00414528 0.00267464 -0.0468693 0.0386805 0.0882189 0.0499115 -0.0375052 -0.0085553 0.00901266 0.0209567 0.0121524 1 -0.00410109 0.00605925 0.0516031 -0.0111659 0.0593025 -9.5299e-05 0.0312594 -0.0252747 -0.00802567 -0.113678 0.112031 -0.00323354 0.0187291 0.0240511 0.0286861 0.02165 -0.0132898 -0.00646185 0.0093175 -0.000533937
Down 0.000361328 0.0900239 0.0202763 -0.00284757 -0.014327 -0.0201954 0.0616912 -0.00180655 0.0141874 0.00520191 -0.00189136 -0.0078597 -0.00198169 0.00198092 -0.0651555 -0.0465834 0.0921483 0.0625062 0.020906 -0.00410109 1 -0.490274 0.0365476 0.0140362 0.0232906 0.0570439 -0.0157759 -0.00686133 -0.0050066 -0.0107126 -0.0233478 0.0405343 0.0161745 -0.00474451 0.00867363 0.00082371 0.0136101 0.00237481 -0.0130116 -0.0234871
Distance -0.00277397 0.0678293 -0.012824 -0.00976936 0.0106698 0.015126 -0.0459006 0.00300264 -0.012814 -0.00131068 0.000633449 -0.00241257 0.00996845 0.0114542 -0.0178841 -0.0082103 -0.283417 -0.0201877 -0.0372798 0.00605925 -0.490274 1 0.01134 -0.0169616 0.0254016 -0.122244 -0.0846761 0.152988 -0.065351 -0.108779 -0.0219124 0.15652 -0.250291 -0.0293665 -0.045556 -0.000955586 -0.0127805 -0.00888934 0.0191181 0.0706323
PossessionScore 0.0597553 -0.0348381 -0.00899614 -0.00730163 0.0270744 -0.0172585 0.00324248 -0.0248978 0.00294967 -0.0141634 0.0110275 -0.00405114 -0.0168874 -0.028428 -0.00511064 -0.0374414 0.0271057 0.732121 0.129637 0.0516031 0.0365476 0.01134 1 0.399757 0.623877 0.0120865 0.0592641 -0.0713552 0.0329428 0.0410116 0.00978239 -0.0612067 0.0866558 0.00289749 -0.00542442 -0.044016 -0.0332927 -0.00614978 0.0164845 0.00146403
DefenseScore -0.0713706 0.0615374 0.0068422 -0.0069499 -0.0109976 -0.0242332 -0.00518451 0.0477899 -0.0111759 -0.00104975 0.0103556 -0.0152488 -0.0146266 -0.0373883 -0.0459467 -0.040623 0.0102749 0.627461 -0.0546376 -0.0111659 0.0140362 -0.0169616 0.399757 1 -0.46696 -0.00965613 -0.0373475 0.0526961 -0.0337454 -0.0129549 -0.051624 0.0769552 -0.052374 -0.0002924 -0.023961 -0.0414156 -0.0315401 -0.0038571 0.0156642 0.00270542
ScoreDifferential 0.118498 -0.0860764 -0.0145125 -0.00111843 0.0354957 0.00401193 0.00754845 -0.0647655 0.0123743 -0.0127686 0.00180908 0.00909313 -0.00382067 0.00445278 0.0342444 -0.00148469 0.0173889 0.171308 0.171647 0.0593025 0.0232906 0.0254016 0.623877 -0.46696 1 0.0198929 0.0890159 -0.113767 0.0605522 0.0506101 0.0534525 -0.12466 0.128253 0.00304456 0.0151964 -0.0071515 -0.00522648 -0.00264418 0.00254735 -0.000894301
OL -0.00815497 -0.128274 -0.00378182 0.0110314 0.0177491 -0.0268286 0.0324264 -0.0139515 -0.0110324 -0.00673531 0.00244387 0.0226695 -0.0256133 0.0188663 -0.00167892 0.00320314 0.0765109 0.0277457 -0.0137127 -9.5299e-05 0.0570439 -0.122244 0.0120865 -0.00965613 0.0198929 1 -0.0321402 -0.35143 0.066272 0.149553 0.0963472 -0.295103 0.296037 0.0451191 0.0989499 0.0220654 0.054205 0.0558175 -0.0410761 -0.0367644
TE 0.0047435 -0.305263 -0.0226103 0.0090295 0.0169172 -0.0362206 0.015037 -0.0120879 -0.00428908 -0.00316586 0.00494756 -0.00586478 0.017158 0.0139808 0.0737655 0.0279883 0.0335952 0.0355594 -0.0236307 0.0312594 -0.0157759 -0.0846761 0.0592641 -0.0373475 0.0890159 -0.0321402 1 -0.762686 -0.0998036 0.193071 0.241469 -0.522045 0.450879 0.0453396 0.0955152 0.0239303 -0.0364339 -0.00988556 0.0222385 -0.049904
WR -0.00718759 0.412041 -0.0224333 -0.0100106 -0.0543958 0.0831503 -0.0671153 0.0437906 0.0123747 0.006971 -0.0086941 -0.00631822 0.000483546 -0.0126869 -0.102119 -0.0423608 -0.0737694 -0.03709 -0.0507719 -0.0252747 -0.00686133 0.152988 -0.0713552 0.0526961 -0.113767 -0.35143 -0.762686 1 -0.447799 -0.287083 -0.312721 0.720324 -0.619565 -0.0443981 -0.111976 -0.0340228 0.00620599 -0.00176324 -8.93363e-05 0.0586811
RB 0.0131403 -0.214916 0.0104475 -0.00304476 0.0701653 -0.0838221 0.0866657 -0.0549868 -0.0193461 -0.0087456 0.00811679 0.0127841 -0.0121271 -0.0128311 0.0764039 0.0428936 0.0216914 -0.00614516 0.119384 -0.00802567 -0.0050066 -0.065351 0.0329428 -0.0337454 0.0605522 0.066272 -0.0998036 -0.447799 1 0.138345 0.147448 -0.342644 0.259895 -0.0114731 0.00166273 0.011674 0.00940146 -0.0250473 -0.00203799 -0.00738116
DL 0.00203846 -0.170177 -0.00607437 0.00622715 0.0103925 0.0101626 0.0189803 -0.0393418 -0.00223768 -0.0684959 0.0885025 -0.0175827 -0.0675123 -0.035616 0.0611269 -0.00531003 0.0687065 0.00644398 -0.00873156 -0.113678 -0.0107126 -0.108779 0.0410116 -0.0129549 0.0506101 0.149553 0.193071 -0.287083 0.138345 1 -0.651873 -0.397574 0.302462 0.0188322 0.0420612 0.0302634 0.0318307 -0.0419872 0.0508485 -0.0324304
LB 0.00481745 -0.143377 -0.0077001 -0.00114103 0.013909 -0.0456454 0.0391757 0.00528049 -0.00883362 0.0640662 -0.0698139 0.0338212 0.0716298 0.0350712 0.0216906 0.0519233 0.00400727 0.00222673 0.0457337 0.112031 -0.0233478 -0.0219124 0.00978239 -0.051624 0.0534525 0.0963472 0.241469 -0.312721 0.147448 -0.651873 1 -0.435445 0.245457 0.0326116 0.0488133 0.00841702 -0.0338281 0.0292764 -0.0360203 -0.0321461
DB -0.00708466 0.375441 0.0166017 -0.00523611 -0.0287042 0.0433673 -0.070833 0.0398134 0.0134294 0.00334944 -0.0206371 -0.019689 -0.00692132 0.000258618 -0.0987371 -0.0564161 -0.0879869 -0.0111334 -0.0443953 -0.00323354 0.0405343 0.15652 -0.0612067 0.0769552 -0.12466 -0.295103 -0.522045 0.720324 -0.342644 -0.397574 -0.435445 1 -0.655911 -0.0613821 -0.108888 -0.0460073 0.0040335 0.0147158 -0.0181336 0.0771527
DefendersInTheBox 0.00769597 -0.388805 -0.0136442 0.00579607 0.0308935 -0.0335469 0.054647 -0.0460815 -0.00922663 -0.0182991 0.0264961 0.0226489 -0.0111136 0.0149243 0.0877398 0.0450868 0.162531 0.0538473 0.04178 0.0187291 0.0161745 -0.250291 0.0866558 -0.052374 0.128253 0.296037 0.450879 -0.619565 0.259895 0.302462 0.245457 -0.655911 1 0.0679826 0.112819 0.0456203 0.025176 -0.0282073 -0.00861416 -0.102835
PlayerHeight_inches -0.00176908 -0.035411 -0.0128484 -0.00331484 -0.0973209 0.00368806 0.0111537 0.075049 0.112233 -0.0314935 -0.0144472 0.0460468 0.0498165 -0.0145416 -0.0236384 0.181879 0.00130746 0.0219289 -0.0256887 0.0240511 -0.00474451 -0.0293665 0.00289749 -0.0002924 0.00304456 0.0451191 0.0453396 -0.0443981 -0.0114731 0.0188322 0.0326116 -0.0613821 0.0679826 1 0.579497 -0.0944312 0.00919645 -0.0231913 0.0104527 -0.00668718
PlayerWeight 0.00201242 -0.016035 -0.0112693 -0.00541411 0.118113 -0.275713 0.161166 -0.0230671 0.0240982 -0.00794552 -0.0234314 0.0532846 0.0427893 0.0254921 -0.143347 0.278195 -0.00189203 0.0123897 -0.0160724 0.0286861 0.00867363 -0.045556 -0.00542442 -0.023961 0.0151964 0.0989499 0.0955152 -0.111976 0.00166273 0.0420612 0.0488133 -0.108888 0.112819 0.579497 1 0.0180552 0.0120604 0.0145043 -0.0121379 -0.0268061
Age_days 0.00243805 -0.0473321 -0.00578191 -0.00406912 0.0369576 0.0277457 0.0875658 -0.131825 0.0267159 0.0205515 -0.00279715 0.0135183 -0.00741878 0.0217452 0.0185569 -0.163255 -0.0182417 -0.0224977 -0.047541 0.02165 0.00082371 -0.000955586 -0.044016 -0.0414156 -0.0071515 0.0220654 0.0239303 -0.0340228 0.011674 0.0302634 0.00841702 -0.0460073 0.0456203 -0.0944312 0.0180552 1 0.00691691 0.00171035 0.0420757 -0.023494
Week 0.00840256 -0.0107138 0.00432249 0.0113771 -0.0174079 -0.00247579 0.0122184 0.0216787 -0.00674732 0.01397 -0.00672273 -0.0156466 -0.00805066 -0.00838878 -0.00100065 -0.0573048 -0.00113208 -0.0102956 -0.013461 -0.0132898 0.0136101 -0.0127805 -0.0332927 -0.0315401 -0.00522648 0.054205 -0.0364339 0.00620599 0.00940146 0.0318307 -0.0338281 0.0040335 0.025176 0.00919645 0.0120604 0.00691691 1 0.0433184 -0.593666 -0.000344592
Humidity -0.0232389 0.0499904 0.0122008 -0.0010606 -0.0156432 0.00305823 -0.00257052 0.020979 -0.00385267 0.312905 -0.398709 0.0107649 -0.028204 0.349886 -0.040752 -0.0505708 0.00583143 -0.0084899 -0.051114 -0.00646185 0.00237481 -0.00888934 -0.00614978 -0.0038571 -0.00264418 0.0558175 -0.00988556 -0.00176324 -0.0250473 -0.0419872 0.0292764 0.0147158 -0.0282073 -0.0231913 0.0145043 0.00171035 0.0433184 1 -0.199688 -0.00448253
Temperature 0.000470429 -0.0118665 -0.00177049 -0.0151276 0.020272 -0.00188456 -0.0125409 -0.0231456 0.0120687 -0.206623 0.12632 0.154285 0.155423 -0.147375 0.0145079 0.0289511 -0.0154099 0.00417279 0.00750955 0.0093175 -0.0130116 0.0191181 0.0164845 0.0156642 0.00254735 -0.0410761 0.0222385 -8.93363e-05 -0.00203799 0.0508485 -0.0360203 -0.0181336 -0.00861416 0.0104527 -0.0121379 0.0420757 -0.593666 -0.199688 1 -0.00943891
Yards 0.00712216 0.0281184 0.00346331 0.00638574 -0.0197326 0.0482147 -0.0150071 -0.00529897 -0.0136083 0.00644375 -0.00485295 -0.0136795 -0.00014642 -0.00588561 0.0519313 0.0277638 -0.0945703 -0.00881821 0.0340664 -0.000533937 -0.0234871 0.0706323 0.00146403 0.00270542 -0.000894301 -0.0367644 -0.049904 0.0586811 -0.00738116 -0.0324304 -0.0321461 0.0771527 -0.102835 -0.00668718 -0.0268061 -0.023494 -0.000344592 -0.00448253 -0.00943891 1
In [45]:
yards_df = corr[['Yards']].reindex(corr[['Yards']].Yards.abs().sort_values(ascending=False).index).drop(['Yards'])
yards_df
Out[45]:
Yards
DefendersInTheBox -0.102835
YardLine_refactor -0.094570
DB 0.077153
Distance 0.070632
WR 0.058681
Dis 0.051931
TE -0.049904
Runner_WR 0.048215
OL -0.036764
team_rushing_yards 0.034066
DL -0.032430
LB -0.032146
Shotgun 0.028118
ypg 0.027764
PlayerWeight -0.026806
Age_days -0.023494
Down -0.023487
Runner_RB -0.019733
Runner_FB -0.015007
Retractable_open -0.013679
Runner_QB -0.013608
Temperature -0.009439
TimeElapsed -0.008818
RB -0.007381
Team_Home 0.007122
PlayerHeight_inches -0.006687
Outdoors 0.006444
PlayDirection_Left 0.006386
rainy_weather -0.005886
Runner_HB -0.005299
Indoors -0.004853
Humidity -0.004483
Wildcat 0.003463
DefenseScore 0.002705
PossessionScore 0.001464
ScoreDifferential -0.000894
team_defense_rushing_yards -0.000534
Week -0.000345
Natural_Grass -0.000146

This correlation matrix is showing some of the collinearity issues that may arise due to the predictors being correlated. A few that stand out are Player Height and Player Weight, perhaps only one of these is needed in the model. Other noticable relationships are the offensive (OL, TE, RB and WR) position groups all have relatively high correlation to each other. This makes sense because if there are more in one group there are less than others (since each team always has 11 players split between the groups). There is a similar correlation between the defensive position groups (DL, LB and CB). Also, DB is highly negatively correlated with Defenders in the box (the box is an area near the line of scrimmage on the defensive side of the ball). This makes sense because typically DB's are outside of this box, perhaps removing DB as a predictor and using defenders in the box will improve collinearity.

Also, there is a high correlation between score differential and the offesnive teams score as well as a negative correlation between defense teams score and score differential. This makes sense because score differential was calculated from offense and defense score. These are all predictors that could be changed to remove some of this collinearity. Finally, both offense and defense scores are highly correlated to time elapsed, which makes sense because score can only increase as time elapsed increases.

There were other interesting interactions in the predictors as well. For example rushing yards per game seems to be positively correlated with player weight. Also a teams rushing yards allowed (on defense) is slightly negatively correlated to the number of Defensive Linemen. Another, Week number (from week 1 at the beginning of the season to 17 at the end of the season) is highly negatively correlated with temperature. This makes sense as the season starts in August and ends in December.

Before making any changes to the predictors, a Variation Inflation Factor method will

Variation Inflation Factor

In [46]:
VIF = pd.DataFrame([variance_inflation_factor(runners_scaled_df.values, i) 
               for i in range(runners_scaled_df.shape[1])], 
              index=runners_scaled_df.columns, columns=['VIF'])
VIF
d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\statsmodels\stats\outliers_influence.py:185: RuntimeWarning:

divide by zero encountered in double_scalars

Out[46]:
VIF
Team_Home 2.089428
Shotgun 2.196366
Wildcat 1.076844
PlayDirection_Left 2.019326
Runner_RB 18.440903
Runner_WR 1.855929
Runner_FB 1.216244
Runner_HB 1.673708
Runner_QB 1.115599
Outdoors 12.230027
Indoors 4.534477
Retractable_open 2.006764
Natural_Grass 3.450662
rainy_weather 1.270670
Dis 1.316724
ypg 1.342663
YardLine_refactor 1.130104
TimeElapsed 3.127147
team_rushing_yards 1.183278
team_defense_rushing_yards 1.068926
Down 1.388693
Distance 1.542104
PossessionScore inf
DefenseScore inf
ScoreDifferential inf
OL 6.590312
TE 32.955169
WR 48.106846
RB 13.035572
DL 486.685657
LB 505.447570
DB 346.477807
DefendersInTheBox 2.101514
PlayerHeight_inches 1.692627
PlayerWeight 1.910212
Age_days 1.090630
Week 1.697584
Humidity 1.429527
Temperature 1.959443

Remove variables due to Multi Collinearity

In [47]:
remove_cols = ['Runner_RB','Dis', 'Indoors','Natural_Grass','ScoreDifferential','RB','DB','PlayerHeight_inches','Week']
In [48]:
runners_scaled_response_drop_df = runners_scaled_response_df.drop(remove_cols, axis=1)
In [49]:
runners_scaled_drop_df = runners_scaled_df.drop(remove_cols, axis=1)
In [50]:
correlation_cols = runners_scaled_response_drop_df.columns
corr = runners_scaled_response_drop_df[correlation_cols].corr()
corr.style.background_gradient(cmap='coolwarm')
Out[50]:
Team_Home Shotgun Wildcat PlayDirection_Left Runner_WR Runner_FB Runner_HB Runner_QB Outdoors Retractable_open rainy_weather ypg YardLine_refactor TimeElapsed team_rushing_yards team_defense_rushing_yards Down Distance PossessionScore DefenseScore OL TE WR DL LB DefendersInTheBox PlayerWeight Age_days Humidity Temperature Yards
Team_Home 1 -0.00341153 -0.00749331 0.00157337 -0.00321319 -0.00493891 -0.00147214 -0.0067029 -0.00652207 0.0100885 -0.00150023 0.00197373 0.0162591 0.000848083 0.00259465 -0.1326 0.000361328 -0.00277397 0.0597553 -0.0713706 -0.00815497 0.0047435 -0.00718759 0.00203846 0.00481745 0.00769597 0.00201242 0.00243805 -0.0232389 0.000470429 0.00712216
Shotgun -0.00341153 1 -0.0353638 -0.00952423 -0.0516742 -0.0249678 0.0271001 0.0277453 0.03776 0.0177511 0.00341703 -0.0452128 -0.00653286 0.00701729 -0.0638755 0.0425688 0.0900239 0.0678293 -0.0348381 0.0615374 -0.128274 -0.305263 0.412041 -0.170177 -0.143377 -0.388805 -0.016035 -0.0473321 0.0499904 -0.0118665 0.0281184
Wildcat -0.00749331 -0.0353638 1 -0.00372767 0.0158516 -0.00411024 -0.00111373 0.0316344 -0.00743554 0.020094 0.00666319 -0.00855981 0.020427 0.000700311 0.0207578 0.000833483 0.0202763 -0.012824 -0.00899614 0.0068422 -0.00378182 -0.0226103 -0.0224333 -0.00607437 -0.0077001 -0.0136442 -0.0112693 -0.00578191 0.0122008 -0.00177049 0.00346331
PlayDirection_Left 0.00157337 -0.00952423 -0.00372767 1 0.0143858 -0.00356424 -0.00266064 0.00116202 -0.000415712 0.00594095 -0.00242505 -0.00190248 -0.0026083 -0.00848466 0.000521269 0.0014761 -0.00284757 -0.00976936 -0.00730163 -0.0069499 0.0110314 0.0090295 -0.0100106 0.00622715 -0.00114103 0.00579607 -0.00541411 -0.00406912 -0.0010606 -0.0151276 0.00638574
Runner_WR -0.00321319 -0.0516742 0.0158516 0.0143858 1 -0.013645 -0.0290132 -0.00937443 0.0133077 -0.0158014 -0.0108617 -0.297926 0.00496401 -0.037085 0.0325508 -0.0188538 -0.0201954 0.015126 -0.0172585 -0.0242332 -0.0268286 -0.0362206 0.0831503 0.0101626 -0.0456454 -0.0335469 -0.275713 0.0277457 0.00305823 -0.00188456 0.0482147
Runner_FB -0.00493891 -0.0249678 -0.00411024 -0.00356424 -0.013645 1 -0.0144293 -0.00466223 0.00413882 -0.0125428 0.0176281 -0.14546 0.0197126 0.0100858 0.0151859 -0.0112131 0.0616912 -0.0459006 0.00324248 -0.00518451 0.0324264 0.015037 -0.0671153 0.0189803 0.0391757 0.054647 0.161166 0.0875658 -0.00257052 -0.0125409 -0.0150071
Runner_HB -0.00147214 0.0271001 -0.00111373 -0.00266064 -0.0290132 -0.0144293 1 -0.00991323 0.0632323 -0.0267345 -0.0197113 -0.00659766 -0.00765926 0.00238722 -0.230702 0.0219427 -0.00180655 0.00300264 -0.0248978 0.0477899 -0.0139515 -0.0120879 0.0437906 -0.0393418 0.00528049 -0.0460815 -0.0230671 -0.131825 0.020979 -0.0231456 -0.00529897
Runner_QB -0.0067029 0.0277453 0.0316344 0.00116202 -0.00937443 -0.00466223 -0.00991323 1 0.00452035 -0.00950322 -0.00205357 -0.117624 0.0073269 0.00237402 0.0107059 -0.00414528 0.0141874 -0.012814 0.00294967 -0.0111759 -0.0110324 -0.00428908 0.0123747 -0.00223768 -0.00883362 -0.00922663 0.0240982 0.0267159 -0.00385267 0.0120687 -0.0136083
Outdoors -0.00652207 0.03776 -0.00743554 -0.000415712 0.0133077 0.00413882 0.0632323 0.00452035 1 -0.368592 0.113721 -0.0310499 0.00293028 0.000917795 -0.00971059 0.00267464 0.00520191 -0.00131068 -0.0141634 -0.00104975 -0.00673531 -0.00316586 0.006971 -0.0684959 0.0640662 -0.0182991 -0.00794552 0.0205515 0.312905 -0.206623 0.00644375
Retractable_open 0.0100885 0.0177511 0.020094 0.00594095 -0.0158014 -0.0125428 -0.0267345 -0.00950322 -0.368592 1 -0.0522747 0.0437359 -0.0205889 -0.00288711 -0.00450725 0.0386805 -0.0078597 -0.00241257 -0.00405114 -0.0152488 0.0226695 -0.00586478 -0.00631822 -0.0175827 0.0338212 0.0226489 0.0532846 0.0135183 0.0107649 0.154285 -0.0136795
rainy_weather -0.00150023 0.00341703 0.00666319 -0.00242505 -0.0108617 0.0176281 -0.0197113 -0.00205357 0.113721 -0.0522747 1 0.00319347 -0.00509531 0.00261859 0.0129912 0.0499115 0.00198092 0.0114542 -0.028428 -0.0373883 0.0188663 0.0139808 -0.0126869 -0.035616 0.0350712 0.0149243 0.0254921 0.0217452 0.349886 -0.147375 -0.00588561
ypg 0.00197373 -0.0452128 -0.00855981 -0.00190248 -0.297926 -0.14546 -0.00659766 -0.117624 -0.0310499 0.0437359 0.00319347 1 -0.0255412 -0.0489945 0.165125 -0.0085553 -0.0465834 -0.0082103 -0.0374414 -0.040623 0.00320314 0.0279883 -0.0423608 -0.00531003 0.0519233 0.0450868 0.278195 -0.163255 -0.0505708 0.0289511 0.0277638
YardLine_refactor 0.0162591 -0.00653286 0.020427 -0.0026083 0.00496401 0.0197126 -0.00765926 0.0073269 0.00293028 -0.0205889 -0.00509531 -0.0255412 1 0.0831257 0.0614055 0.00901266 0.0921483 -0.283417 0.0271057 0.0102749 0.0765109 0.0335952 -0.0737694 0.0687065 0.00400727 0.162531 -0.00189203 -0.0182417 0.00583143 -0.0154099 -0.0945703
TimeElapsed 0.000848083 0.00701729 0.000700311 -0.00848466 -0.037085 0.0100858 0.00238722 0.00237402 0.000917795 -0.00288711 0.00261859 -0.0489945 0.0831257 1 0.0421813 0.0209567 0.0625062 -0.0201877 0.732121 0.627461 0.0277457 0.0355594 -0.03709 0.00644398 0.00222673 0.0538473 0.0123897 -0.0224977 -0.0084899 0.00417279 -0.00881821
team_rushing_yards 0.00259465 -0.0638755 0.0207578 0.000521269 0.0325508 0.0151859 -0.230702 0.0107059 -0.00971059 -0.00450725 0.0129912 0.165125 0.0614055 0.0421813 1 0.0121524 0.020906 -0.0372798 0.129637 -0.0546376 -0.0137127 -0.0236307 -0.0507719 -0.00873156 0.0457337 0.04178 -0.0160724 -0.047541 -0.051114 0.00750955 0.0340664
team_defense_rushing_yards -0.1326 0.0425688 0.000833483 0.0014761 -0.0188538 -0.0112131 0.0219427 -0.00414528 0.00267464 0.0386805 0.0499115 -0.0085553 0.00901266 0.0209567 0.0121524 1 -0.00410109 0.00605925 0.0516031 -0.0111659 -9.5299e-05 0.0312594 -0.0252747 -0.113678 0.112031 0.0187291 0.0286861 0.02165 -0.00646185 0.0093175 -0.000533937
Down 0.000361328 0.0900239 0.0202763 -0.00284757 -0.0201954 0.0616912 -0.00180655 0.0141874 0.00520191 -0.0078597 0.00198092 -0.0465834 0.0921483 0.0625062 0.020906 -0.00410109 1 -0.490274 0.0365476 0.0140362 0.0570439 -0.0157759 -0.00686133 -0.0107126 -0.0233478 0.0161745 0.00867363 0.00082371 0.00237481 -0.0130116 -0.0234871
Distance -0.00277397 0.0678293 -0.012824 -0.00976936 0.015126 -0.0459006 0.00300264 -0.012814 -0.00131068 -0.00241257 0.0114542 -0.0082103 -0.283417 -0.0201877 -0.0372798 0.00605925 -0.490274 1 0.01134 -0.0169616 -0.122244 -0.0846761 0.152988 -0.108779 -0.0219124 -0.250291 -0.045556 -0.000955586 -0.00888934 0.0191181 0.0706323
PossessionScore 0.0597553 -0.0348381 -0.00899614 -0.00730163 -0.0172585 0.00324248 -0.0248978 0.00294967 -0.0141634 -0.00405114 -0.028428 -0.0374414 0.0271057 0.732121 0.129637 0.0516031 0.0365476 0.01134 1 0.399757 0.0120865 0.0592641 -0.0713552 0.0410116 0.00978239 0.0866558 -0.00542442 -0.044016 -0.00614978 0.0164845 0.00146403
DefenseScore -0.0713706 0.0615374 0.0068422 -0.0069499 -0.0242332 -0.00518451 0.0477899 -0.0111759 -0.00104975 -0.0152488 -0.0373883 -0.040623 0.0102749 0.627461 -0.0546376 -0.0111659 0.0140362 -0.0169616 0.399757 1 -0.00965613 -0.0373475 0.0526961 -0.0129549 -0.051624 -0.052374 -0.023961 -0.0414156 -0.0038571 0.0156642 0.00270542
OL -0.00815497 -0.128274 -0.00378182 0.0110314 -0.0268286 0.0324264 -0.0139515 -0.0110324 -0.00673531 0.0226695 0.0188663 0.00320314 0.0765109 0.0277457 -0.0137127 -9.5299e-05 0.0570439 -0.122244 0.0120865 -0.00965613 1 -0.0321402 -0.35143 0.149553 0.0963472 0.296037 0.0989499 0.0220654 0.0558175 -0.0410761 -0.0367644
TE 0.0047435 -0.305263 -0.0226103 0.0090295 -0.0362206 0.015037 -0.0120879 -0.00428908 -0.00316586 -0.00586478 0.0139808 0.0279883 0.0335952 0.0355594 -0.0236307 0.0312594 -0.0157759 -0.0846761 0.0592641 -0.0373475 -0.0321402 1 -0.762686 0.193071 0.241469 0.450879 0.0955152 0.0239303 -0.00988556 0.0222385 -0.049904
WR -0.00718759 0.412041 -0.0224333 -0.0100106 0.0831503 -0.0671153 0.0437906 0.0123747 0.006971 -0.00631822 -0.0126869 -0.0423608 -0.0737694 -0.03709 -0.0507719 -0.0252747 -0.00686133 0.152988 -0.0713552 0.0526961 -0.35143 -0.762686 1 -0.287083 -0.312721 -0.619565 -0.111976 -0.0340228 -0.00176324 -8.93363e-05 0.0586811
DL 0.00203846 -0.170177 -0.00607437 0.00622715 0.0101626 0.0189803 -0.0393418 -0.00223768 -0.0684959 -0.0175827 -0.035616 -0.00531003 0.0687065 0.00644398 -0.00873156 -0.113678 -0.0107126 -0.108779 0.0410116 -0.0129549 0.149553 0.193071 -0.287083 1 -0.651873 0.302462 0.0420612 0.0302634 -0.0419872 0.0508485 -0.0324304
LB 0.00481745 -0.143377 -0.0077001 -0.00114103 -0.0456454 0.0391757 0.00528049 -0.00883362 0.0640662 0.0338212 0.0350712 0.0519233 0.00400727 0.00222673 0.0457337 0.112031 -0.0233478 -0.0219124 0.00978239 -0.051624 0.0963472 0.241469 -0.312721 -0.651873 1 0.245457 0.0488133 0.00841702 0.0292764 -0.0360203 -0.0321461
DefendersInTheBox 0.00769597 -0.388805 -0.0136442 0.00579607 -0.0335469 0.054647 -0.0460815 -0.00922663 -0.0182991 0.0226489 0.0149243 0.0450868 0.162531 0.0538473 0.04178 0.0187291 0.0161745 -0.250291 0.0866558 -0.052374 0.296037 0.450879 -0.619565 0.302462 0.245457 1 0.112819 0.0456203 -0.0282073 -0.00861416 -0.102835
PlayerWeight 0.00201242 -0.016035 -0.0112693 -0.00541411 -0.275713 0.161166 -0.0230671 0.0240982 -0.00794552 0.0532846 0.0254921 0.278195 -0.00189203 0.0123897 -0.0160724 0.0286861 0.00867363 -0.045556 -0.00542442 -0.023961 0.0989499 0.0955152 -0.111976 0.0420612 0.0488133 0.112819 1 0.0180552 0.0145043 -0.0121379 -0.0268061
Age_days 0.00243805 -0.0473321 -0.00578191 -0.00406912 0.0277457 0.0875658 -0.131825 0.0267159 0.0205515 0.0135183 0.0217452 -0.163255 -0.0182417 -0.0224977 -0.047541 0.02165 0.00082371 -0.000955586 -0.044016 -0.0414156 0.0220654 0.0239303 -0.0340228 0.0302634 0.00841702 0.0456203 0.0180552 1 0.00171035 0.0420757 -0.023494
Humidity -0.0232389 0.0499904 0.0122008 -0.0010606 0.00305823 -0.00257052 0.020979 -0.00385267 0.312905 0.0107649 0.349886 -0.0505708 0.00583143 -0.0084899 -0.051114 -0.00646185 0.00237481 -0.00888934 -0.00614978 -0.0038571 0.0558175 -0.00988556 -0.00176324 -0.0419872 0.0292764 -0.0282073 0.0145043 0.00171035 1 -0.199688 -0.00448253
Temperature 0.000470429 -0.0118665 -0.00177049 -0.0151276 -0.00188456 -0.0125409 -0.0231456 0.0120687 -0.206623 0.154285 -0.147375 0.0289511 -0.0154099 0.00417279 0.00750955 0.0093175 -0.0130116 0.0191181 0.0164845 0.0156642 -0.0410761 0.0222385 -8.93363e-05 0.0508485 -0.0360203 -0.00861416 -0.0121379 0.0420757 -0.199688 1 -0.00943891
Yards 0.00712216 0.0281184 0.00346331 0.00638574 0.0482147 -0.0150071 -0.00529897 -0.0136083 0.00644375 -0.0136795 -0.00588561 0.0277638 -0.0945703 -0.00881821 0.0340664 -0.000533937 -0.0234871 0.0706323 0.00146403 0.00270542 -0.0367644 -0.049904 0.0586811 -0.0324304 -0.0321461 -0.102835 -0.0268061 -0.023494 -0.00448253 -0.00943891 1
In [51]:
yards_df = corr[['Yards']].reindex(corr[['Yards']].Yards.abs().sort_values(ascending=False).index).drop(['Yards'])
yards_df
Out[51]:
Yards
DefendersInTheBox -0.102835
YardLine_refactor -0.094570
Distance 0.070632
WR 0.058681
TE -0.049904
Runner_WR 0.048215
OL -0.036764
team_rushing_yards 0.034066
DL -0.032430
LB -0.032146
Shotgun 0.028118
ypg 0.027764
PlayerWeight -0.026806
Age_days -0.023494
Down -0.023487
Runner_FB -0.015007
Retractable_open -0.013679
Runner_QB -0.013608
Temperature -0.009439
TimeElapsed -0.008818
Team_Home 0.007122
Outdoors 0.006444
PlayDirection_Left 0.006386
rainy_weather -0.005886
Runner_HB -0.005299
Humidity -0.004483
Wildcat 0.003463
DefenseScore 0.002705
PossessionScore 0.001464
team_defense_rushing_yards -0.000534
In [52]:
VIF = pd.DataFrame([variance_inflation_factor(runners_scaled_drop_df.values, i) 
               for i in range(runners_scaled_drop_df.shape[1])], 
              index=runners_scaled_drop_df.columns, columns=['VIF'])
VIF
Out[52]:
VIF
Team_Home 1.799020
Shotgun 1.803147
Wildcat 1.016135
PlayDirection_Left 1.730657
Runner_WR 1.210959
Runner_FB 1.097497
Runner_HB 1.122751
Runner_QB 1.027735
Outdoors 2.363331
Retractable_open 1.152255
rainy_weather 1.233156
ypg 1.327491
YardLine_refactor 1.127593
TimeElapsed 3.115660
team_rushing_yards 1.170042
team_defense_rushing_yards 1.055507
Down 1.385467
Distance 1.540213
PossessionScore 2.301371
DefenseScore 1.743145
OL 1.578521
TE 3.303691
WR 5.167899
DL 3.915869
LB 3.864423
DefendersInTheBox 2.092871
PlayerWeight 1.228166
Age_days 1.071451
Humidity 1.240629
Temperature 1.091552

Define X and y variables

In [509]:
X = runners_scaled_drop_df
X = X.astype(float)
y = runners[['Yards']]
y = y.astype(float)

Split into Train and Test Sets

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
In [1151]:
y_train_regression = runners[['Yards']].iloc[X_train.index]
y_test_regression = runners[['Yards']].iloc[X_test.index]

Linear Regression Model (for calculating basic diagnostic statistics)

A Multiple Linear Regression will now be calculated on full data set to get a summary of statistics from the ordinary least squares analysis. This will give more insight into which predictors in particular are important to this regression

In [510]:
X_sm = X
In [511]:
X_sm = sm.add_constant(X_sm)
In [512]:
model = sm.OLS(y,X_sm).fit()
model.summary()
Out[512]:
OLS Regression Results
Dep. Variable: Yards R-squared: 0.025
Model: OLS Adj. R-squared: 0.024
Method: Least Squares F-statistic: 26.10
Date: Fri, 28 Aug 2020 Prob (F-statistic): 2.02e-143
Time: 13:18:04 Log-Likelihood: -1.0141e+05
No. Observations: 31007 AIC: 2.029e+05
Df Residuals: 30976 BIC: 2.031e+05
Df Model: 30
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 4.0756 0.096 42.461 0.000 3.887 4.264
Team_Home 0.1205 0.074 1.633 0.102 -0.024 0.265
Shotgun 0.0102 0.088 0.117 0.907 -0.162 0.182
Wildcat 0.1519 0.732 0.207 0.836 -1.284 1.588
PlayDirection_Left 0.0779 0.072 1.075 0.282 -0.064 0.220
Runner_WR 2.2429 0.245 9.171 0.000 1.764 2.722
Runner_FB 0.0821 0.462 0.178 0.859 -0.824 0.988
Runner_HB -0.1072 0.222 -0.482 0.630 -0.543 0.329
Runner_QB -0.8678 0.649 -1.336 0.182 -2.141 0.405
Outdoors 0.0334 0.090 0.371 0.710 -0.143 0.210
Retractable_open -0.2982 0.168 -1.775 0.076 -0.627 0.031
rainy_weather -0.1579 0.154 -1.029 0.304 -0.459 0.143
ypg 0.2790 0.042 6.688 0.000 0.197 0.361
YardLine_refactor -0.4835 0.038 -12.579 0.000 -0.559 -0.408
TimeElapsed -0.0147 0.064 -0.230 0.818 -0.140 0.111
team_rushing_yards 0.1922 0.039 4.910 0.000 0.115 0.269
team_defense_rushing_yards 0.0337 0.037 0.905 0.365 -0.039 0.107
Down -0.0029 0.043 -0.069 0.945 -0.086 0.081
Distance 0.1891 0.045 4.211 0.000 0.101 0.277
PossessionScore 0.0542 0.055 0.987 0.324 -0.053 0.162
DefenseScore 0.0081 0.048 0.169 0.866 -0.086 0.102
OL -0.0748 0.045 -1.645 0.100 -0.164 0.014
TE -0.1999 0.066 -3.038 0.002 -0.329 -0.071
WR -0.3504 0.082 -4.254 0.000 -0.512 -0.189
DL -0.2434 0.072 -3.398 0.001 -0.384 -0.103
LB -0.2986 0.071 -4.197 0.000 -0.438 -0.159
DefendersInTheBox -0.4956 0.052 -9.457 0.000 -0.598 -0.393
PlayerWeight -0.0666 0.040 -1.661 0.097 -0.145 0.012
Age_days -0.0823 0.037 -2.198 0.028 -0.156 -0.009
Humidity -0.0176 0.042 -0.423 0.672 -0.099 0.064
Temperature -0.0752 0.038 -1.983 0.047 -0.149 -0.001
Omnibus: 28540.185 Durbin-Watson: 1.995
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1662691.963
Skew: 4.315 Prob(JB): 0.00
Kurtosis: 37.821 Cond. No. 33.7


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [58]:
model.pvalues[model.pvalues < 0.05]
Out[58]:
const                 0.000000e+00
Runner_WR             4.970440e-20
ypg                   2.298644e-11
YardLine_refactor     3.395599e-36
team_rushing_yards    9.152625e-07
Distance              2.550512e-05
TE                    2.379680e-03
WR                    2.102963e-05
DL                    6.798331e-04
LB                    2.717258e-05
DefendersInTheBox     3.387820e-21
Age_days              2.797423e-02
Temperature           4.741948e-02
dtype: float64

From this ordinary least squares analysis, the r^2 value is extremely low. This makes sense because it is such a complex problem with many 'human factors' involved. However, as the saying goes, Football is a 'game of inches' meaning that any insight at all, however low the r^2 value could be valuable for teams gaining the upper hand in strategy on their opponents.

The p values show that a few variable have more statistical significance than others, including if the runner is a Wide Receiver, the yards per game for the runner, the field position of the offense, the distance to first down and others.

Diagnostic Plots

In [513]:
residuals = model.resid
fitted = model.fittedvalues
smoothed = lowess(residuals,fitted)

Residual Plot

In [514]:
fig, ax = plt.subplots()
ax.scatter(fitted, residuals, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
ax.set_ylabel('Residuals')
ax.set_xlabel('Fitted Values')
ax.set_title('Residuals vs. Fitted')
ax.plot([min(fitted),max(fitted)],[0,0],color = 'k',linestyle = ':', alpha = .3)
Out[514]:
[<matplotlib.lines.Line2D at 0x1e98614d248>]

This Residual plot shows many values with very high residuals and isn't evenly scattered, so perhaps regression isn't the best choice for this complex problem.

Normal Q-Q Plot

In [515]:
sorted_student_residuals = pd.Series(model.get_influence().resid_studentized_internal)
sorted_student_residuals.index = model.resid.index
sorted_student_residuals = sorted_student_residuals.sort_values(ascending = True)
In [516]:
df = pd.DataFrame(sorted_student_residuals)
df.columns = ['sorted_student_residuals']
df['theoretical_quantiles'] = stats.probplot(df['sorted_student_residuals'], dist = 'norm', fit = False)[0]
rankings = abs(df['sorted_student_residuals']).sort_values(ascending = False)

fig, ax = plt.subplots()
x = df['theoretical_quantiles']
y = df['sorted_student_residuals']
ax.scatter(x,y, edgecolor = 'k',facecolor = 'none')
ax.set_title('Normal Q-Q')
ax.set_ylabel('Standardized Residuals')
ax.set_xlabel('Theoretical Quantiles')
ax.plot([np.min([x,y]),np.max([x,y])],[np.min([x,y]),np.max([x,y])], color = 'r', ls = '--')
Out[516]:
[<matplotlib.lines.Line2D at 0x1e9861d6408>]

This plot confirms that the distribution of yards is right skewed, with many large outliers.

Scale-Location Plot

In [517]:
student_residuals = model.get_influence().resid_studentized_internal
sqrt_student_residuals = pd.Series(np.sqrt(np.abs(student_residuals)))
sqrt_student_residuals.index = model.resid.index
smoothed = lowess(sqrt_student_residuals,fitted)

fig, ax = plt.subplots()
ax.scatter(fitted, sqrt_student_residuals, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
ax.set_ylabel('$\sqrt{|Studentized \ Residuals|}$')
ax.set_xlabel('Fitted Values')
ax.set_title('Scale-Location')
ax.set_ylim(0,max(sqrt_student_residuals)+0.1)
Out[517]:
(0, 3.937091275411568)

This plot shows that there is one outlier, and the residuals aren't completely random across the range of predictors.residuals spread wider along the axis.

Residuals vs. Leverage Plot

In [518]:
student_residuals = pd.Series(model.get_influence().resid_studentized_internal)
student_residuals.index = model.resid.index
df = pd.DataFrame(student_residuals)
df.columns = ['student_residuals']
df['leverage'] = model.get_influence().hat_matrix_diag
smoothed = lowess(df['student_residuals'],df['leverage'])
sorted_student_residuals = abs(df['student_residuals']).sort_values(ascending = False)
In [519]:
fig, ax = plt.subplots()
x = df['leverage']
y = df['student_residuals']
xpos = max(x)+max(x)*0.01  
ax.scatter(x, y, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
ax.set_ylabel('Studentized Residuals')
ax.set_xlabel('Leverage')
ax.set_title('Residuals vs. Leverage')
ax.set_ylim(min(y)-min(y)*0.15,max(y)+max(y)*0.15)
ax.set_xlim(-0.01,max(x)+max(x)*0.05)
plt.tight_layout()

cooksx = np.linspace(min(x), xpos, 50)
p = len(model.params)
poscooks1y = np.sqrt((p*(1-cooksx))/cooksx)
poscooks05y = np.sqrt(0.5*(p*(1-cooksx))/cooksx)
negcooks1y = -np.sqrt((p*(1-cooksx))/cooksx)
negcooks05y = -np.sqrt(0.5*(p*(1-cooksx))/cooksx)

ax.plot(cooksx,poscooks1y,label = "Cook's Distance", ls = ':', color = 'r')
ax.plot(cooksx,poscooks05y, ls = ':', color = 'r')
ax.plot(cooksx,negcooks1y, ls = ':', color = 'r')
ax.plot(cooksx,negcooks05y, ls = ':', color = 'r')
ax.plot([0,0],ax.get_ylim(), ls=":", alpha = .3, color = 'k')
ax.plot(ax.get_xlim(), [0,0], ls=":", alpha = .3, color = 'k')
ax.annotate('1.0', xy = (xpos, poscooks1y[-1]), color = 'r')
ax.annotate('0.5', xy = (xpos, poscooks05y[-1]), color = 'r')
ax.annotate('1.0', xy = (xpos, negcooks1y[-1]), color = 'r')
ax.annotate('0.5', xy = (xpos, negcooks05y[-1]), color = 'r')
ax.legend()
plt.show()

This plot shows that all cases are inside Cook's Distance lines, so there are no high leverage points.

These linear regression diagnostic plots show that perhaps linear regression is not the best solution to this complex problem. Since the data is right skewed, with many outliers of high rushing yards, perhaps some featuring could be done to remove these data points or refactor them with some transformation. For the sake of comparing models, linear regression will still be used but with the knowledge that some of the issues that may arise come from these outliers and non-linearity that comes from the complexity of the problem.

Synergy Interaction Effects Study

In [59]:
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
x_interaction = poly.fit_transform(X_sm)
In [60]:
interaction_df = pd.DataFrame(x_interaction, index=X_sm.index, columns=poly.get_feature_names(X_sm.columns))
In [61]:
model = sm.OLS(y,interaction_df).fit()
model.summary()
Out[61]:
OLS Regression Results
Dep. Variable: Yards R-squared: 0.047
Model: OLS Adj. R-squared: 0.033
Method: Least Squares F-statistic: 3.321
Date: Fri, 28 Aug 2020 Prob (F-statistic): 8.14e-110
Time: 01:29:33 Log-Likelihood: -1.0105e+05
No. Observations: 31007 AIC: 2.030e+05
Df Residuals: 30550 BIC: 2.068e+05
Df Model: 456
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 4.1946 0.179 23.413 0.000 3.843 4.546
Team_Home 0.0200 0.093 0.214 0.830 -0.163 0.203
Shotgun -0.0665 0.118 -0.566 0.572 -0.297 0.164
Wildcat 1.1095 1.612 0.688 0.491 -2.050 4.269
PlayDirection_Left 0.1721 0.089 1.927 0.054 -0.003 0.347
Runner_WR 12.2255 0.962 12.710 0.000 10.340 14.111
Runner_FB 1.3504 3.060 0.441 0.659 -4.646 7.347
Runner_HB -0.0717 1.791 -0.040 0.968 -3.582 3.439
Runner_QB 15.0147 3.499 4.292 0.000 8.157 21.872
Outdoors 0.0954 0.090 1.058 0.290 -0.081 0.272
Retractable_open -0.1558 0.188 -0.829 0.407 -0.524 0.213
rainy_weather 0.0101 0.413 0.024 0.981 -0.800 0.820
ypg 0.2206 0.057 3.860 0.000 0.109 0.333
YardLine_refactor -0.1812 0.051 -3.534 0.000 -0.282 -0.081
TimeElapsed 0.0962 0.088 1.095 0.274 -0.076 0.269
team_rushing_yards 0.1043 0.054 1.937 0.053 -0.001 0.210
team_defense_rushing_yards 0.0122 0.051 0.239 0.811 -0.088 0.112
Down 0.0791 0.060 1.324 0.186 -0.038 0.196
Distance 0.1085 0.064 1.698 0.090 -0.017 0.234
PossessionScore 0.0595 0.077 0.775 0.439 -0.091 0.210
DefenseScore -0.0736 0.067 -1.094 0.274 -0.206 0.058
OL -0.0453 0.073 -0.622 0.534 -0.188 0.097
TE -0.0236 0.092 -0.256 0.798 -0.204 0.157
WR -0.0843 0.114 -0.739 0.460 -0.308 0.139
DL -0.0805 0.105 -0.768 0.443 -0.286 0.125
LB -0.1151 0.099 -1.160 0.246 -0.310 0.079
DefendersInTheBox -0.2631 0.075 -3.530 0.000 -0.409 -0.117
PlayerWeight -0.0985 0.056 -1.771 0.077 -0.207 0.011
Age_days -0.0464 0.051 -0.903 0.366 -0.147 0.054
Humidity 0.0224 0.048 0.467 0.641 -0.072 0.116
Temperature -0.0912 0.058 -1.586 0.113 -0.204 0.022
const Team_Home 0.0200 0.093 0.214 0.830 -0.163 0.203
const Shotgun -0.0665 0.118 -0.566 0.572 -0.297 0.164
const Wildcat 1.1095 1.612 0.688 0.491 -2.050 4.269
const PlayDirection_Left 0.1721 0.089 1.927 0.054 -0.003 0.347
const Runner_WR 12.2255 0.962 12.710 0.000 10.340 14.111
const Runner_FB 1.3504 3.060 0.441 0.659 -4.646 7.347
const Runner_HB -0.0717 1.791 -0.040 0.968 -3.582 3.439
const Runner_QB 15.0147 3.499 4.292 0.000 8.157 21.872
const Outdoors 0.0954 0.090 1.058 0.290 -0.081 0.272
const Retractable_open -0.1558 0.188 -0.829 0.407 -0.524 0.213
const rainy_weather 0.0101 0.413 0.024 0.981 -0.800 0.820
const ypg 0.2206 0.057 3.860 0.000 0.109 0.333
const YardLine_refactor -0.1812 0.051 -3.534 0.000 -0.282 -0.081
const TimeElapsed 0.0962 0.088 1.095 0.274 -0.076 0.269
const team_rushing_yards 0.1043 0.054 1.937 0.053 -0.001 0.210
const team_defense_rushing_yards 0.0122 0.051 0.239 0.811 -0.088 0.112
const Down 0.0791 0.060 1.324 0.186 -0.038 0.196
const Distance 0.1085 0.064 1.698 0.090 -0.017 0.234
const PossessionScore 0.0595 0.077 0.775 0.439 -0.091 0.210
const DefenseScore -0.0736 0.067 -1.094 0.274 -0.206 0.058
const OL -0.0453 0.073 -0.622 0.534 -0.188 0.097
const TE -0.0236 0.092 -0.256 0.798 -0.204 0.157
const WR -0.0843 0.114 -0.739 0.460 -0.308 0.139
const DL -0.0805 0.105 -0.768 0.443 -0.286 0.125
const LB -0.1151 0.099 -1.160 0.246 -0.310 0.079
const DefendersInTheBox -0.2631 0.075 -3.530 0.000 -0.409 -0.117
const PlayerWeight -0.0985 0.056 -1.771 0.077 -0.207 0.011
const Age_days -0.0464 0.051 -0.903 0.366 -0.147 0.054
const Humidity 0.0224 0.048 0.467 0.641 -0.072 0.116
const Temperature -0.0912 0.058 -1.586 0.113 -0.204 0.022
Team_Home Shotgun 0.3950 0.180 2.191 0.028 0.042 0.748
Team_Home Wildcat -0.9313 1.949 -0.478 0.633 -4.751 2.889
Team_Home PlayDirection_Left 0.1654 0.148 1.117 0.264 -0.125 0.456
Team_Home Runner_WR -0.6841 0.505 -1.356 0.175 -1.673 0.305
Team_Home Runner_FB -0.6005 1.001 -0.600 0.549 -2.562 1.361
Team_Home Runner_HB -0.1146 0.512 -0.224 0.823 -1.119 0.890
Team_Home Runner_QB -1.0054 1.565 -0.643 0.520 -4.072 2.061
Team_Home Outdoors -0.1454 0.191 -0.763 0.446 -0.519 0.228
Team_Home Retractable_open -0.2689 0.379 -0.710 0.478 -1.011 0.473
Team_Home rainy_weather -0.2572 0.336 -0.765 0.444 -0.916 0.401
Team_Home ypg -0.0997 0.087 -1.143 0.253 -0.270 0.071
Team_Home YardLine_refactor -0.0002 0.079 -0.003 0.998 -0.155 0.155
Team_Home TimeElapsed -0.1834 0.133 -1.382 0.167 -0.443 0.077
Team_Home team_rushing_yards 0.0194 0.084 0.230 0.818 -0.145 0.184
Team_Home team_defense_rushing_yards 0.0886 0.082 1.081 0.280 -0.072 0.249
Team_Home Down 0.0051 0.088 0.059 0.953 -0.166 0.177
Team_Home Distance -0.0038 0.093 -0.041 0.967 -0.186 0.178
Team_Home PossessionScore 0.0020 0.115 0.018 0.986 -0.223 0.227
Team_Home DefenseScore 0.1026 0.101 1.019 0.308 -0.095 0.300
Team_Home OL -0.0336 0.096 -0.352 0.725 -0.221 0.154
Team_Home TE -0.1143 0.136 -0.838 0.402 -0.382 0.153
Team_Home WR -0.1752 0.170 -1.030 0.303 -0.509 0.158
Team_Home DL 0.0171 0.150 0.114 0.909 -0.277 0.312
Team_Home LB 0.0134 0.149 0.090 0.928 -0.278 0.305
Team_Home DefendersInTheBox -0.0038 0.111 -0.034 0.973 -0.222 0.214
Team_Home PlayerWeight 0.1127 0.084 1.343 0.179 -0.052 0.277
Team_Home Age_days 0.0254 0.078 0.324 0.746 -0.128 0.179
Team_Home Humidity -0.0922 0.089 -1.031 0.303 -0.268 0.083
Team_Home Temperature -0.0203 0.079 -0.257 0.797 -0.176 0.135
Shotgun Wildcat 1.385e-14 3.85e-14 0.359 0.719 -6.17e-14 8.94e-14
Shotgun PlayDirection_Left -0.3755 0.176 -2.135 0.033 -0.720 -0.031
Shotgun Runner_WR -0.3271 0.658 -0.497 0.619 -1.617 0.963
Shotgun Runner_FB 0.0148 1.600 0.009 0.993 -3.121 3.151
Shotgun Runner_HB -0.2633 0.529 -0.498 0.618 -1.300 0.773
Shotgun Runner_QB 1.3867 1.709 0.811 0.417 -1.964 4.737
Shotgun Outdoors 0.0521 0.224 0.232 0.816 -0.388 0.492
Shotgun Retractable_open 0.1836 0.417 0.441 0.660 -0.633 1.000
Shotgun rainy_weather 0.3767 0.381 0.990 0.322 -0.369 1.123
Shotgun ypg -0.1221 0.102 -1.202 0.230 -0.321 0.077
Shotgun YardLine_refactor -0.0058 0.095 -0.062 0.951 -0.192 0.180
Shotgun TimeElapsed -0.1512 0.159 -0.950 0.342 -0.463 0.161
Shotgun team_rushing_yards 0.1019 0.099 1.027 0.304 -0.093 0.296
Shotgun team_defense_rushing_yards 0.1551 0.091 1.698 0.089 -0.024 0.334
Shotgun Down 0.0183 0.107 0.170 0.865 -0.192 0.229
Shotgun Distance -0.0180 0.112 -0.161 0.872 -0.237 0.201
Shotgun PossessionScore -0.0622 0.136 -0.458 0.647 -0.329 0.204
Shotgun DefenseScore 0.0498 0.117 0.427 0.669 -0.179 0.278
Shotgun OL -0.0346 0.146 -0.237 0.813 -0.321 0.251
Shotgun TE 0.3606 0.198 1.824 0.068 -0.027 0.748
Shotgun WR 0.3219 0.227 1.419 0.156 -0.123 0.767
Shotgun DL 0.1152 0.175 0.659 0.510 -0.227 0.458
Shotgun LB 0.1034 0.174 0.593 0.553 -0.238 0.445
Shotgun DefendersInTheBox -0.1364 0.130 -1.047 0.295 -0.392 0.119
Shotgun PlayerWeight 0.0168 0.100 0.168 0.867 -0.179 0.213
Shotgun Age_days -0.1716 0.093 -1.847 0.065 -0.354 0.010
Shotgun Humidity -0.0434 0.104 -0.417 0.677 -0.247 0.161
Shotgun Temperature 0.1084 0.094 1.154 0.249 -0.076 0.293
Wildcat PlayDirection_Left -0.6782 2.073 -0.327 0.744 -4.742 3.386
Wildcat Runner_WR 4.0327 5.186 0.778 0.437 -6.132 14.198
Wildcat Runner_FB -3.085e-14 4.79e-14 -0.645 0.519 -1.25e-13 6.3e-14
Wildcat Runner_HB -7.9448 6.131 -1.296 0.195 -19.961 4.071
Wildcat Runner_QB 15.8773 6.784 2.340 0.019 2.580 29.175
Wildcat Outdoors -1.9113 2.390 -0.800 0.424 -6.595 2.773
Wildcat Retractable_open 1.0311 3.996 0.258 0.796 -6.801 8.864
Wildcat rainy_weather -0.9532 3.989 -0.239 0.811 -8.772 6.866
Wildcat ypg 3.9631 1.620 2.447 0.014 0.788 7.138
Wildcat YardLine_refactor 1.1020 1.112 0.991 0.322 -1.078 3.282
Wildcat TimeElapsed 0.2776 1.879 0.148 0.883 -3.404 3.960
Wildcat team_rushing_yards -1.5935 1.392 -1.145 0.252 -4.322 1.135
Wildcat team_defense_rushing_yards 0.3153 0.843 0.374 0.708 -1.338 1.968
Wildcat Down 2.3164 1.156 2.003 0.045 0.050 4.583
Wildcat Distance 1.5115 1.158 1.305 0.192 -0.758 3.781
Wildcat PossessionScore -0.5798 1.680 -0.345 0.730 -3.873 2.713
Wildcat DefenseScore 0.2066 1.401 0.148 0.883 -2.539 2.952
Wildcat OL 0.8093 1.252 0.646 0.518 -1.645 3.264
Wildcat TE 1.0400 1.329 0.783 0.434 -1.564 3.644
Wildcat WR 0.6986 1.798 0.389 0.698 -2.826 4.223
Wildcat DL -1.2334 1.563 -0.789 0.430 -4.297 1.830
Wildcat LB -0.1301 1.791 -0.073 0.942 -3.641 3.381
Wildcat DefendersInTheBox 0.4697 1.401 0.335 0.737 -2.275 3.215
Wildcat PlayerWeight -0.6431 1.141 -0.564 0.573 -2.879 1.593
Wildcat Age_days 0.4020 1.803 0.223 0.824 -3.131 3.935
Wildcat Humidity 0.5344 1.080 0.495 0.621 -1.582 2.650
Wildcat Temperature -0.5951 1.179 -0.505 0.614 -2.906 1.716
PlayDirection_Left Runner_WR 0.1025 0.497 0.206 0.837 -0.872 1.077
PlayDirection_Left Runner_FB -0.3354 1.011 -0.332 0.740 -2.318 1.647
PlayDirection_Left Runner_HB 1.0786 0.450 2.396 0.017 0.196 1.961
PlayDirection_Left Runner_QB -2.1364 1.526 -1.400 0.161 -5.127 0.854
PlayDirection_Left Outdoors -0.3625 0.181 -2.004 0.045 -0.717 -0.008
PlayDirection_Left Retractable_open 0.0638 0.340 0.188 0.851 -0.603 0.730
PlayDirection_Left rainy_weather -0.0086 0.310 -0.028 0.978 -0.617 0.600
PlayDirection_Left ypg -0.1392 0.084 -1.661 0.097 -0.303 0.025
PlayDirection_Left YardLine_refactor 0.0074 0.077 0.096 0.924 -0.144 0.159
PlayDirection_Left TimeElapsed -0.1892 0.129 -1.462 0.144 -0.443 0.064
PlayDirection_Left team_rushing_yards 0.0470 0.079 0.596 0.551 -0.107 0.201
PlayDirection_Left team_defense_rushing_yards -0.0072 0.075 -0.096 0.924 -0.154 0.140
PlayDirection_Left Down -0.0197 0.086 -0.229 0.819 -0.188 0.149
PlayDirection_Left Distance 0.0496 0.091 0.544 0.586 -0.129 0.228
PlayDirection_Left PossessionScore -0.1145 0.111 -1.030 0.303 -0.332 0.103
PlayDirection_Left DefenseScore 0.1216 0.097 1.257 0.209 -0.068 0.311
PlayDirection_Left OL -0.0181 0.092 -0.196 0.844 -0.199 0.163
PlayDirection_Left TE -0.0373 0.132 -0.282 0.778 -0.296 0.222
PlayDirection_Left WR -0.0728 0.166 -0.439 0.661 -0.398 0.253
PlayDirection_Left DL -0.2214 0.145 -1.531 0.126 -0.505 0.062
PlayDirection_Left LB -0.2124 0.144 -1.479 0.139 -0.494 0.069
PlayDirection_Left DefendersInTheBox 0.1297 0.108 1.204 0.229 -0.081 0.341
PlayDirection_Left PlayerWeight 0.1301 0.081 1.612 0.107 -0.028 0.288
PlayDirection_Left Age_days 0.0367 0.075 0.487 0.626 -0.111 0.184
PlayDirection_Left Humidity 0.0008 0.084 0.009 0.993 -0.164 0.165
PlayDirection_Left Temperature -0.0368 0.076 -0.483 0.629 -0.186 0.113
Runner_WR Runner_FB 6.303e-15 4.55e-14 0.138 0.890 -8.3e-14 9.56e-14
Runner_WR Runner_HB 7.076e-14 6.14e-14 1.153 0.249 -4.95e-14 1.91e-13
Runner_WR Runner_QB -2.391e-14 2.52e-14 -0.950 0.342 -7.32e-14 2.54e-14
Runner_WR Outdoors 1.1306 0.651 1.736 0.083 -0.146 2.407
Runner_WR Retractable_open 0.6072 1.415 0.429 0.668 -2.166 3.380
Runner_WR rainy_weather 1.1425 1.176 0.972 0.331 -1.162 3.447
Runner_WR ypg 12.5323 0.988 12.685 0.000 10.596 14.469
Runner_WR YardLine_refactor -0.8028 0.277 -2.896 0.004 -1.346 -0.260
Runner_WR TimeElapsed 0.4471 0.468 0.955 0.340 -0.470 1.365
Runner_WR team_rushing_yards -0.7680 0.267 -2.875 0.004 -1.291 -0.244
Runner_WR team_defense_rushing_yards -0.2971 0.269 -1.105 0.269 -0.824 0.230
Runner_WR Down -0.8586 0.344 -2.496 0.013 -1.533 -0.184
Runner_WR Distance -1.0239 0.394 -2.598 0.009 -1.796 -0.251
Runner_WR PossessionScore -0.4466 0.402 -1.111 0.267 -1.234 0.341
Runner_WR DefenseScore -0.3666 0.348 -1.054 0.292 -1.048 0.315
Runner_WR OL -0.4298 0.426 -1.009 0.313 -1.265 0.405
Runner_WR TE -1.0339 0.461 -2.241 0.025 -1.938 -0.129
Runner_WR WR -1.5640 0.497 -3.147 0.002 -2.538 -0.590
Runner_WR DL -0.5921 0.510 -1.161 0.246 -1.592 0.408
Runner_WR LB -0.8653 0.505 -1.714 0.087 -1.855 0.124
Runner_WR DefendersInTheBox 0.1094 0.389 0.281 0.779 -0.654 0.872
Runner_WR PlayerWeight -0.3171 0.200 -1.582 0.114 -0.710 0.076
Runner_WR Age_days -0.4460 0.314 -1.420 0.156 -1.062 0.170
Runner_WR Humidity -0.4438 0.287 -1.548 0.122 -1.006 0.118
Runner_WR Temperature 0.4842 0.274 1.766 0.077 -0.053 1.022
Runner_FB Runner_HB 7.613e-14 4.12e-14 1.848 0.065 -4.59e-15 1.57e-13
Runner_FB Runner_QB -7.573e-14 4.08e-14 -1.857 0.063 -1.56e-13 4.19e-15
Runner_FB Outdoors 0.7775 1.330 0.585 0.559 -1.830 3.385
Runner_FB Retractable_open -0.0684 3.640 -0.019 0.985 -7.204 7.067
Runner_FB rainy_weather 1.7416 1.786 0.975 0.330 -1.759 5.243
Runner_FB ypg 1.3221 2.765 0.478 0.632 -4.097 6.741
Runner_FB YardLine_refactor 0.3639 0.502 0.725 0.469 -0.620 1.348
Runner_FB TimeElapsed 0.1422 0.919 0.155 0.877 -1.660 1.944
Runner_FB team_rushing_yards 0.1418 0.741 0.191 0.848 -1.311 1.595
Runner_FB team_defense_rushing_yards 0.6070 0.584 1.040 0.298 -0.537 1.751
Runner_FB Down 0.2236 0.420 0.533 0.594 -0.599 1.046
Runner_FB Distance 0.1988 0.496 0.401 0.689 -0.774 1.172
Runner_FB PossessionScore 0.0931 0.775 0.120 0.904 -1.427 1.613
Runner_FB DefenseScore -0.0869 0.760 -0.114 0.909 -1.577 1.403
Runner_FB OL -0.3383 0.583 -0.580 0.562 -1.482 0.805
Runner_FB TE -0.5097 0.992 -0.514 0.607 -2.454 1.435
Runner_FB WR -0.1114 1.209 -0.092 0.927 -2.480 2.258
Runner_FB DL -0.1238 0.847 -0.146 0.884 -1.784 1.537
Runner_FB LB -0.3521 0.900 -0.391 0.696 -2.117 1.412
Runner_FB DefendersInTheBox 0.1180 0.693 0.170 0.865 -1.241 1.477
Runner_FB PlayerWeight -0.4358 0.990 -0.440 0.660 -2.375 1.504
Runner_FB Age_days 0.2931 1.090 0.269 0.788 -1.843 2.429
Runner_FB Humidity -0.0370 0.596 -0.062 0.950 -1.205 1.131
Runner_FB Temperature 0.4703 0.515 0.912 0.362 -0.540 1.481
Runner_HB Runner_QB 4.166e-14 1.5e-14 2.778 0.005 1.23e-14 7.1e-14
Runner_HB Outdoors 0.3932 0.825 0.477 0.633 -1.223 2.009
Runner_HB Retractable_open -1.2406 1.687 -0.735 0.462 -4.548 2.067
Runner_HB rainy_weather 0.0313 1.284 0.024 0.981 -2.485 2.547
Runner_HB ypg -0.3823 1.377 -0.278 0.781 -3.082 2.318
Runner_HB YardLine_refactor -0.0491 0.250 -0.196 0.844 -0.539 0.441
Runner_HB TimeElapsed 0.2483 0.441 0.563 0.574 -0.617 1.113
Runner_HB team_rushing_yards 0.3999 2.832 0.141 0.888 -5.151 5.951
Runner_HB team_defense_rushing_yards -0.1093 0.236 -0.463 0.643 -0.572 0.353
Runner_HB Down -0.3384 0.263 -1.287 0.198 -0.854 0.177
Runner_HB Distance -0.2031 0.282 -0.720 0.471 -0.756 0.349
Runner_HB PossessionScore -0.2992 0.401 -0.745 0.456 -1.086 0.488
Runner_HB DefenseScore -0.1552 0.285 -0.545 0.586 -0.713 0.403
Runner_HB OL -0.2498 0.392 -0.637 0.524 -1.019 0.519
Runner_HB TE -0.3683 0.622 -0.592 0.554 -1.588 0.851
Runner_HB WR -0.8387 0.755 -1.110 0.267 -2.320 0.642
Runner_HB DL 0.4642 0.422 1.101 0.271 -0.362 1.291
Runner_HB LB 0.3000 0.431 0.695 0.487 -0.546 1.146
Runner_HB DefendersInTheBox -0.5914 0.353 -1.673 0.094 -1.284 0.101
Runner_HB PlayerWeight 0.2899 2.152 0.135 0.893 -3.928 4.508
Runner_HB Age_days 0.2335 0.533 0.438 0.661 -0.811 1.278
Runner_HB Humidity 0.2708 0.318 0.851 0.395 -0.353 0.894
Runner_HB Temperature -0.3863 0.241 -1.606 0.108 -0.858 0.085
Runner_QB Outdoors 3.2299 1.922 1.681 0.093 -0.537 6.997
Runner_QB Retractable_open 0.2300 5.271 0.044 0.965 -10.102 10.562
Runner_QB rainy_weather -4.4237 3.532 -1.253 0.210 -11.346 2.498
Runner_QB ypg 15.6924 3.008 5.216 0.000 9.796 21.589
Runner_QB YardLine_refactor -0.3345 0.766 -0.437 0.662 -1.836 1.167
Runner_QB TimeElapsed 0.4447 1.437 0.310 0.757 -2.371 3.260
Runner_QB team_rushing_yards -0.5355 0.848 -0.632 0.528 -2.197 1.126
Runner_QB team_defense_rushing_yards -0.0754 0.798 -0.094 0.925 -1.640 1.489
Runner_QB Down -0.0342 0.739 -0.046 0.963 -1.482 1.414
Runner_QB Distance -1.0120 1.004 -1.008 0.314 -2.981 0.957
Runner_QB PossessionScore -0.7606 1.273 -0.598 0.550 -3.256 1.735
Runner_QB DefenseScore -0.4203 1.165 -0.361 0.718 -2.703 1.863
Runner_QB OL -0.2010 1.782 -0.113 0.910 -3.694 3.292
Runner_QB TE -0.7875 1.817 -0.433 0.665 -4.349 2.774
Runner_QB WR -0.5556 2.038 -0.273 0.785 -4.549 3.438
Runner_QB DL -1.1628 1.647 -0.706 0.480 -4.391 2.065
Runner_QB LB -1.4769 1.637 -0.902 0.367 -4.686 1.733
Runner_QB DefendersInTheBox 0.6240 0.972 0.642 0.521 -1.281 2.528
Runner_QB PlayerWeight -0.2524 0.859 -0.294 0.769 -1.935 1.430
Runner_QB Age_days 0.1521 0.680 0.224 0.823 -1.181 1.485
Runner_QB Humidity 0.4063 0.899 0.452 0.651 -1.355 2.167
Runner_QB Temperature 0.3790 0.840 0.451 0.652 -1.268 2.026
Outdoors Retractable_open 2.867e-14 2.8e-14 1.024 0.306 -2.62e-14 8.36e-14
Outdoors rainy_weather -0.2203 0.516 -0.427 0.670 -1.232 0.792
Outdoors ypg 0.0504 0.107 0.472 0.637 -0.159 0.260
Outdoors YardLine_refactor -0.0981 0.095 -1.030 0.303 -0.285 0.088
Outdoors TimeElapsed -0.1102 0.162 -0.678 0.498 -0.429 0.208
Outdoors team_rushing_yards -0.0968 0.104 -0.933 0.351 -0.300 0.107
Outdoors team_defense_rushing_yards -0.0415 0.100 -0.414 0.679 -0.238 0.155
Outdoors Down -0.1389 0.106 -1.305 0.192 -0.347 0.070
Outdoors Distance -0.0312 0.112 -0.279 0.780 -0.251 0.188
Outdoors PossessionScore 0.1931 0.140 1.383 0.167 -0.081 0.467
Outdoors DefenseScore 0.1088 0.123 0.887 0.375 -0.132 0.349
Outdoors OL 0.1587 0.114 1.389 0.165 -0.065 0.383
Outdoors TE -0.0380 0.167 -0.227 0.820 -0.366 0.290
Outdoors WR 0.0076 0.211 0.036 0.971 -0.405 0.420
Outdoors DL -0.0593 0.187 -0.316 0.752 -0.427 0.308
Outdoors LB -0.0612 0.181 -0.337 0.736 -0.417 0.294
Outdoors DefendersInTheBox 0.0654 0.136 0.481 0.631 -0.201 0.332
Outdoors PlayerWeight -0.0263 0.106 -0.248 0.804 -0.234 0.181
Outdoors Age_days 0.0230 0.096 0.240 0.810 -0.165 0.210
Outdoors Humidity -0.0329 0.093 -0.353 0.724 -0.216 0.150
Outdoors Temperature 0.1086 0.107 1.010 0.312 -0.102 0.319
Retractable_open rainy_weather -1.3501 1.418 -0.952 0.341 -4.130 1.429
Retractable_open ypg -0.3262 0.200 -1.630 0.103 -0.718 0.066
Retractable_open YardLine_refactor -0.0837 0.185 -0.453 0.651 -0.446 0.278
Retractable_open TimeElapsed -0.3883 0.319 -1.216 0.224 -1.014 0.237
Retractable_open team_rushing_yards -0.2629 0.274 -0.960 0.337 -0.800 0.274
Retractable_open team_defense_rushing_yards -0.2934 0.181 -1.624 0.104 -0.647 0.061
Retractable_open Down -0.0111 0.208 -0.053 0.958 -0.418 0.396
Retractable_open Distance 0.0272 0.217 0.125 0.901 -0.399 0.453
Retractable_open PossessionScore 0.2322 0.266 0.873 0.383 -0.289 0.753
Retractable_open DefenseScore 0.0809 0.245 0.331 0.741 -0.399 0.561
Retractable_open OL 0.0537 0.199 0.269 0.788 -0.337 0.444
Retractable_open TE -0.2897 0.300 -0.966 0.334 -0.878 0.298
Retractable_open WR 0.3421 0.396 0.864 0.388 -0.434 1.118
Retractable_open DL 0.1539 0.369 0.417 0.677 -0.570 0.878
Retractable_open LB 0.4676 0.360 1.300 0.193 -0.237 1.173
Retractable_open DefendersInTheBox 0.5852 0.265 2.206 0.027 0.065 1.105
Retractable_open PlayerWeight 0.3303 0.217 1.525 0.127 -0.094 0.755
Retractable_open Age_days -0.1128 0.183 -0.615 0.538 -0.472 0.246
Retractable_open Humidity -0.0886 0.217 -0.408 0.683 -0.514 0.337
Retractable_open Temperature 0.0330 0.224 0.147 0.883 -0.406 0.473
rainy_weather ypg -0.0174 0.190 -0.091 0.927 -0.390 0.355
rainy_weather YardLine_refactor 0.0836 0.167 0.501 0.616 -0.243 0.411
rainy_weather TimeElapsed 0.1200 0.268 0.447 0.655 -0.406 0.646
rainy_weather team_rushing_yards -0.4061 0.174 -2.331 0.020 -0.748 -0.065
rainy_weather team_defense_rushing_yards -0.1902 0.159 -1.193 0.233 -0.502 0.122
rainy_weather Down -0.3278 0.179 -1.833 0.067 -0.678 0.023
rainy_weather Distance 0.0028 0.194 0.014 0.989 -0.377 0.383
rainy_weather PossessionScore 0.0339 0.263 0.129 0.898 -0.482 0.550
rainy_weather DefenseScore -0.3040 0.221 -1.376 0.169 -0.737 0.129
rainy_weather OL -0.0525 0.192 -0.273 0.785 -0.430 0.325
rainy_weather TE -0.3994 0.293 -1.365 0.172 -0.973 0.174
rainy_weather WR -0.4565 0.376 -1.213 0.225 -1.194 0.281
rainy_weather DL -0.2003 0.314 -0.638 0.523 -0.816 0.415
rainy_weather LB -0.2492 0.311 -0.801 0.423 -0.859 0.361
rainy_weather DefendersInTheBox 0.0862 0.233 0.370 0.712 -0.371 0.543
rainy_weather PlayerWeight -0.1290 0.174 -0.743 0.457 -0.469 0.211
rainy_weather Age_days -0.0413 0.166 -0.250 0.803 -0.366 0.283
rainy_weather Humidity 0.0540 0.436 0.124 0.901 -0.801 0.909
rainy_weather Temperature -0.0760 0.189 -0.403 0.687 -0.446 0.294
ypg YardLine_refactor -0.1330 0.044 -3.008 0.003 -0.220 -0.046
ypg TimeElapsed -0.0103 0.075 -0.137 0.891 -0.157 0.137
ypg team_rushing_yards -0.0311 0.050 -0.622 0.534 -0.129 0.067
ypg team_defense_rushing_yards 0.0202 0.045 0.453 0.651 -0.067 0.108
ypg Down 0.0494 0.048 1.034 0.301 -0.044 0.143
ypg Distance 0.0015 0.052 0.029 0.977 -0.100 0.103
ypg PossessionScore -0.0299 0.063 -0.473 0.636 -0.154 0.094
ypg DefenseScore 0.0146 0.055 0.265 0.791 -0.093 0.123
ypg OL 0.0960 0.055 1.754 0.079 -0.011 0.203
ypg TE 0.1063 0.078 1.364 0.173 -0.046 0.259
ypg WR 0.2044 0.098 2.091 0.037 0.013 0.396
ypg DL 0.0205 0.082 0.250 0.803 -0.141 0.182
ypg LB -0.0783 0.082 -0.957 0.339 -0.239 0.082
ypg DefendersInTheBox 0.0102 0.063 0.162 0.871 -0.113 0.134
ypg PlayerWeight -0.0499 0.050 -1.002 0.316 -0.148 0.048
ypg Age_days 0.0181 0.054 0.333 0.739 -0.088 0.125
ypg Humidity 0.0222 0.049 0.454 0.650 -0.074 0.118
ypg Temperature 0.0123 0.046 0.265 0.791 -0.078 0.103
YardLine_refactor TimeElapsed -0.0161 0.068 -0.236 0.814 -0.149 0.117
YardLine_refactor team_rushing_yards 0.0145 0.042 0.345 0.730 -0.068 0.097
YardLine_refactor team_defense_rushing_yards 0.0594 0.040 1.499 0.134 -0.018 0.137
YardLine_refactor Down 0.0982 0.045 2.189 0.029 0.010 0.186
YardLine_refactor Distance 0.2971 0.044 6.805 0.000 0.212 0.383
YardLine_refactor PossessionScore -0.0165 0.058 -0.285 0.775 -0.130 0.097
YardLine_refactor DefenseScore -0.0382 0.052 -0.735 0.462 -0.140 0.064
YardLine_refactor OL -0.0322 0.050 -0.651 0.515 -0.129 0.065
YardLine_refactor TE -0.0708 0.071 -0.993 0.320 -0.210 0.069
YardLine_refactor WR 0.0908 0.088 1.033 0.302 -0.082 0.263
YardLine_refactor DL 0.0951 0.077 1.230 0.219 -0.056 0.247
YardLine_refactor LB 0.1232 0.077 1.605 0.108 -0.027 0.274
YardLine_refactor DefendersInTheBox -0.0126 0.057 -0.221 0.825 -0.125 0.099
YardLine_refactor PlayerWeight -0.0567 0.043 -1.316 0.188 -0.141 0.028
YardLine_refactor Age_days -0.0066 0.040 -0.162 0.871 -0.086 0.073
YardLine_refactor Humidity 0.0548 0.044 1.235 0.217 -0.032 0.142
YardLine_refactor Temperature 0.0281 0.041 0.688 0.492 -0.052 0.108
TimeElapsed team_rushing_yards 0.0434 0.070 0.622 0.534 -0.093 0.180
TimeElapsed team_defense_rushing_yards -0.0858 0.068 -1.258 0.209 -0.220 0.048
TimeElapsed Down 0.0389 0.073 0.530 0.596 -0.105 0.183
TimeElapsed Distance -0.0346 0.081 -0.428 0.669 -0.193 0.124
TimeElapsed PossessionScore -0.1334 0.063 -2.105 0.035 -0.258 -0.009
TimeElapsed DefenseScore 0.0903 0.061 1.488 0.137 -0.029 0.209
TimeElapsed OL 0.0887 0.079 1.121 0.262 -0.066 0.244
TimeElapsed TE 0.1721 0.118 1.461 0.144 -0.059 0.403
TimeElapsed WR 0.0053 0.147 0.036 0.971 -0.282 0.293
TimeElapsed DL -0.2206 0.128 -1.724 0.085 -0.471 0.030
TimeElapsed LB -0.0275 0.127 -0.216 0.829 -0.277 0.222
TimeElapsed DefendersInTheBox -0.1388 0.088 -1.579 0.114 -0.311 0.033
TimeElapsed PlayerWeight 0.1398 0.073 1.915 0.055 -0.003 0.283
TimeElapsed Age_days -0.0784 0.067 -1.173 0.241 -0.209 0.053
TimeElapsed Humidity 0.1073 0.075 1.438 0.151 -0.039 0.254
TimeElapsed Temperature 0.0221 0.066 0.337 0.736 -0.106 0.150
team_rushing_yards team_defense_rushing_yards 0.0325 0.041 0.794 0.427 -0.048 0.113
team_rushing_yards Down 0.0060 0.046 0.129 0.898 -0.085 0.097
team_rushing_yards Distance 0.0686 0.049 1.412 0.158 -0.027 0.164
team_rushing_yards PossessionScore -0.0045 0.059 -0.077 0.939 -0.121 0.112
team_rushing_yards DefenseScore 0.0367 0.052 0.711 0.477 -0.064 0.138
team_rushing_yards OL -0.0893 0.057 -1.579 0.114 -0.200 0.022
team_rushing_yards TE -0.1098 0.073 -1.514 0.130 -0.252 0.032
team_rushing_yards WR -0.1366 0.090 -1.516 0.130 -0.313 0.040
team_rushing_yards DL 0.0910 0.080 1.144 0.253 -0.065 0.247
team_rushing_yards LB 0.0518 0.079 0.653 0.514 -0.104 0.208
team_rushing_yards DefendersInTheBox -0.0180 0.060 -0.299 0.765 -0.136 0.100
team_rushing_yards PlayerWeight 0.0350 0.048 0.734 0.463 -0.058 0.128
team_rushing_yards Age_days 0.0115 0.043 0.268 0.788 -0.073 0.096
team_rushing_yards Humidity 0.0690 0.048 1.447 0.148 -0.024 0.162
team_rushing_yards Temperature -0.0477 0.043 -1.104 0.270 -0.132 0.037
team_defense_rushing_yards Down 0.0352 0.044 0.796 0.426 -0.052 0.122
team_defense_rushing_yards Distance 0.0281 0.047 0.594 0.552 -0.065 0.121
team_defense_rushing_yards PossessionScore 0.0315 0.059 0.532 0.595 -0.085 0.148
team_defense_rushing_yards DefenseScore 0.0616 0.052 1.185 0.236 -0.040 0.163
team_defense_rushing_yards OL -0.0706 0.049 -1.448 0.148 -0.166 0.025
team_defense_rushing_yards TE 0.0754 0.071 1.058 0.290 -0.064 0.215
team_defense_rushing_yards WR 0.1325 0.088 1.502 0.133 -0.040 0.305
team_defense_rushing_yards DL 0.0152 0.076 0.199 0.842 -0.134 0.164
team_defense_rushing_yards LB -0.0412 0.074 -0.559 0.576 -0.186 0.103
team_defense_rushing_yards DefendersInTheBox 0.0938 0.056 1.688 0.091 -0.015 0.203
team_defense_rushing_yards PlayerWeight -0.0241 0.045 -0.540 0.589 -0.112 0.063
team_defense_rushing_yards Age_days 0.0123 0.041 0.300 0.764 -0.068 0.092
team_defense_rushing_yards Humidity -0.0113 0.045 -0.250 0.803 -0.100 0.078
team_defense_rushing_yards Temperature 0.0028 0.042 0.065 0.948 -0.080 0.086
Down Distance -0.0079 0.042 -0.185 0.853 -0.091 0.075
Down PossessionScore -0.0266 0.063 -0.422 0.673 -0.150 0.097
Down DefenseScore 0.0016 0.054 0.029 0.977 -0.105 0.108
Down OL 0.0951 0.056 1.683 0.092 -0.016 0.206
Down TE 0.0462 0.088 0.524 0.601 -0.127 0.219
Down WR 0.1641 0.103 1.595 0.111 -0.038 0.366
Down DL 0.1436 0.072 1.986 0.047 0.002 0.285
Down LB 0.1539 0.075 2.042 0.041 0.006 0.302
Down DefendersInTheBox -0.1435 0.058 -2.493 0.013 -0.256 -0.031
Down PlayerWeight -0.0397 0.046 -0.855 0.392 -0.131 0.051
Down Age_days 0.0291 0.047 0.620 0.535 -0.063 0.121
Down Humidity 0.0511 0.049 1.042 0.297 -0.045 0.147
Down Temperature -0.0879 0.045 -1.949 0.051 -0.176 0.000
Distance PossessionScore 0.0038 0.070 0.054 0.957 -0.133 0.141
Distance DefenseScore 0.0202 0.059 0.344 0.731 -0.095 0.135
Distance OL 0.0472 0.069 0.687 0.492 -0.087 0.182
Distance TE 0.0130 0.099 0.132 0.895 -0.180 0.206
Distance WR 0.1259 0.116 1.088 0.277 -0.101 0.353
Distance DL 0.1108 0.081 1.372 0.170 -0.047 0.269
Distance LB 0.0718 0.084 0.856 0.392 -0.093 0.236
Distance DefendersInTheBox -0.0382 0.059 -0.652 0.514 -0.153 0.077
Distance PlayerWeight -0.0642 0.051 -1.249 0.211 -0.165 0.037
Distance Age_days 0.0571 0.050 1.152 0.249 -0.040 0.154
Distance Humidity 0.0509 0.052 0.984 0.325 -0.050 0.152
Distance Temperature 0.0188 0.048 0.388 0.698 -0.076 0.114
PossessionScore DefenseScore -0.1268 0.058 -2.197 0.028 -0.240 -0.014
PossessionScore OL -0.0418 0.073 -0.573 0.567 -0.185 0.101
PossessionScore TE -0.0891 0.102 -0.869 0.385 -0.290 0.112
PossessionScore WR 0.0041 0.124 0.033 0.973 -0.239 0.247
PossessionScore DL 0.1191 0.110 1.084 0.279 -0.096 0.334
PossessionScore LB 0.0615 0.109 0.561 0.575 -0.153 0.276
PossessionScore DefendersInTheBox -0.0382 0.080 -0.481 0.631 -0.194 0.118
PossessionScore PlayerWeight 0.0204 0.061 0.332 0.740 -0.100 0.141
PossessionScore Age_days 0.0458 0.061 0.751 0.453 -0.074 0.165
PossessionScore Humidity -0.1202 0.064 -1.867 0.062 -0.246 0.006
PossessionScore Temperature 0.0052 0.059 0.089 0.929 -0.110 0.121
DefenseScore OL -0.0154 0.064 -0.242 0.809 -0.140 0.109
DefenseScore TE 0.0578 0.093 0.624 0.533 -0.124 0.239
DefenseScore WR 0.0544 0.115 0.473 0.636 -0.171 0.280
DefenseScore DL 0.1001 0.094 1.065 0.287 -0.084 0.284
DefenseScore LB -0.0485 0.094 -0.515 0.606 -0.233 0.136
DefenseScore DefendersInTheBox 0.0757 0.069 1.092 0.275 -0.060 0.212
DefenseScore PlayerWeight -0.0983 0.055 -1.774 0.076 -0.207 0.010
DefenseScore Age_days -0.0321 0.052 -0.621 0.535 -0.133 0.069
DefenseScore Humidity 0.0121 0.058 0.209 0.834 -0.101 0.125
DefenseScore Temperature 0.0971 0.053 1.847 0.065 -0.006 0.200
OL TE 0.0212 0.053 0.396 0.692 -0.084 0.126
OL WR 0.0765 0.082 0.935 0.350 -0.084 0.237
OL DL -0.0660 0.091 -0.726 0.468 -0.244 0.112
OL LB 0.0101 0.088 0.114 0.909 -0.163 0.183
OL DefendersInTheBox 0.0604 0.067 0.899 0.368 -0.071 0.192
OL PlayerWeight 0.1317 0.052 2.514 0.012 0.029 0.234
OL Age_days 0.0494 0.050 0.990 0.322 -0.048 0.147
OL Humidity 0.0097 0.057 0.170 0.865 -0.102 0.121
OL Temperature 0.0060 0.048 0.124 0.901 -0.088 0.100
TE WR -0.0105 0.060 -0.176 0.860 -0.127 0.106
TE DL 0.0493 0.133 0.372 0.710 -0.211 0.309
TE LB 0.0245 0.128 0.191 0.849 -0.227 0.276
TE DefendersInTheBox -0.0841 0.098 -0.860 0.390 -0.276 0.107
TE PlayerWeight 0.2193 0.068 3.208 0.001 0.085 0.353
TE Age_days 0.0180 0.071 0.253 0.800 -0.121 0.157
TE Humidity 0.0172 0.076 0.226 0.821 -0.132 0.166
TE Temperature -0.0355 0.071 -0.499 0.618 -0.175 0.104
WR DL 0.0976 0.144 0.679 0.497 -0.184 0.379
WR LB 0.1260 0.141 0.896 0.371 -0.150 0.402
WR DefendersInTheBox -0.1402 0.109 -1.282 0.200 -0.354 0.074
WR PlayerWeight 0.1423 0.089 1.592 0.111 -0.033 0.317
WR Age_days 0.0814 0.092 0.889 0.374 -0.098 0.261
WR Humidity -0.0190 0.097 -0.197 0.844 -0.209 0.171
WR Temperature -0.0416 0.091 -0.459 0.646 -0.219 0.136
DL LB -3.219e-06 0.051 -6.3e-05 1.000 -0.100 0.100
DL DefendersInTheBox 0.0727 0.072 1.014 0.311 -0.068 0.213
DL PlayerWeight 0.0480 0.080 0.598 0.550 -0.109 0.206
DL Age_days -0.0442 0.077 -0.573 0.567 -0.196 0.107
DL Humidity -0.0092 0.087 -0.106 0.916 -0.180 0.162
DL Temperature -0.0884 0.077 -1.152 0.249 -0.239 0.062
LB DefendersInTheBox 0.0632 0.077 0.824 0.410 -0.087 0.214
LB PlayerWeight 0.0785 0.079 0.999 0.318 -0.076 0.233
LB Age_days -0.0392 0.077 -0.505 0.613 -0.191 0.113
LB Humidity -0.0470 0.085 -0.554 0.579 -0.213 0.119
LB Temperature -0.0529 0.078 -0.675 0.500 -0.207 0.101
DefendersInTheBox PlayerWeight -0.0939 0.061 -1.542 0.123 -0.213 0.025
DefendersInTheBox Age_days 0.0518 0.056 0.918 0.359 -0.059 0.162
DefendersInTheBox Humidity -0.0930 0.064 -1.459 0.145 -0.218 0.032
DefendersInTheBox Temperature 0.0640 0.058 1.106 0.269 -0.049 0.178
PlayerWeight Age_days -0.0133 0.045 -0.298 0.766 -0.101 0.074
PlayerWeight Humidity 0.0380 0.048 0.796 0.426 -0.056 0.131
PlayerWeight Temperature -0.0678 0.043 -1.570 0.116 -0.152 0.017
Age_days Humidity -0.0059 0.046 -0.127 0.899 -0.097 0.085
Age_days Temperature 0.0050 0.038 0.130 0.897 -0.070 0.080
Humidity Temperature 0.0418 0.059 0.705 0.481 -0.074 0.158
Omnibus: 28310.718 Durbin-Watson: 2.004
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1638509.991
Skew: 4.259 Prob(JB): 0.00
Kurtosis: 37.579 Cond. No. 9.14e+16


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.69e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [70]:
interaction_summary = pd.DataFrame((model.summary().tables[1]))
columns = list(interaction_summary.iloc[0].astype(str))
interaction_summary.columns = columns
interaction_summary = interaction_summary[1:]
interaction_summary.index = list(interaction_summary.iloc[0:,0].astype(str))
interaction_summary = interaction_summary.drop(columns=[''])
interaction_summary = interaction_summary.rename(columns={'P>|t|': "p"})
interaction_summary = interaction_summary.astype(str).astype(float)
In [71]:
interaction_summary
Out[71]:
coef std err t p [0.025 0.975]
const 4.1946 0.179 23.413 0.000 3.843 4.546
Team_Home 0.0200 0.093 0.214 0.830 -0.163 0.203
Shotgun -0.0665 0.118 -0.566 0.572 -0.297 0.164
Wildcat 1.1095 1.612 0.688 0.491 -2.050 4.269
PlayDirection_Left 0.1721 0.089 1.927 0.054 -0.003 0.347
... ... ... ... ... ... ...
PlayerWeight Humidity 0.0380 0.048 0.796 0.426 -0.056 0.131
PlayerWeight Temperature -0.0678 0.043 -1.570 0.116 -0.152 0.017
Age_days Humidity -0.0059 0.046 -0.127 0.899 -0.097 0.085
Age_days Temperature 0.0050 0.038 0.130 0.897 -0.070 0.080
Humidity Temperature 0.0418 0.059 0.705 0.481 -0.074 0.158

496 rows × 6 columns

In [72]:
interaction_summary.sort_values(by=['p']).head(29)
Out[72]:
coef std err t p [0.025 0.975]
const 4.194600e+00 1.790000e-01 23.413 0.000 3.843000e+00 4.546000e+00
Runner_WR ypg 1.253230e+01 9.880000e-01 12.685 0.000 1.059600e+01 1.446900e+01
YardLine_refactor Distance 2.971000e-01 4.400000e-02 6.805 0.000 2.120000e-01 3.830000e-01
const DefendersInTheBox -2.631000e-01 7.500000e-02 -3.530 0.000 -4.090000e-01 -1.170000e-01
const YardLine_refactor -1.812000e-01 5.100000e-02 -3.534 0.000 -2.820000e-01 -8.100000e-02
const ypg 2.206000e-01 5.700000e-02 3.860 0.000 1.090000e-01 3.330000e-01
const Runner_QB 1.501470e+01 3.499000e+00 4.292 0.000 8.157000e+00 2.187200e+01
const Runner_WR 1.222550e+01 9.620000e-01 12.710 0.000 1.034000e+01 1.411100e+01
DefendersInTheBox -2.631000e-01 7.500000e-02 -3.530 0.000 -4.090000e-01 -1.170000e-01
YardLine_refactor -1.812000e-01 5.100000e-02 -3.534 0.000 -2.820000e-01 -8.100000e-02
ypg 2.206000e-01 5.700000e-02 3.860 0.000 1.090000e-01 3.330000e-01
Runner_QB ypg 1.569240e+01 3.008000e+00 5.216 0.000 9.796000e+00 2.158900e+01
Runner_WR 1.222550e+01 9.620000e-01 12.710 0.000 1.034000e+01 1.411100e+01
Runner_QB 1.501470e+01 3.499000e+00 4.292 0.000 8.157000e+00 2.187200e+01
TE PlayerWeight 2.193000e-01 6.800000e-02 3.208 0.001 8.500000e-02 3.530000e-01
Runner_WR WR -1.564000e+00 4.970000e-01 -3.147 0.002 -2.538000e+00 -5.900000e-01
ypg YardLine_refactor -1.330000e-01 4.400000e-02 -3.008 0.003 -2.200000e-01 -4.600000e-02
Runner_WR team_rushing_yards -7.680000e-01 2.670000e-01 -2.875 0.004 -1.291000e+00 -2.440000e-01
Runner_WR YardLine_refactor -8.028000e-01 2.770000e-01 -2.896 0.004 -1.346000e+00 -2.600000e-01
Runner_HB Runner_QB 4.166000e-14 1.500000e-14 2.778 0.005 1.230000e-14 7.100000e-14
Runner_WR Distance -1.023900e+00 3.940000e-01 -2.598 0.009 -1.796000e+00 -2.510000e-01
OL PlayerWeight 1.317000e-01 5.200000e-02 2.514 0.012 2.900000e-02 2.340000e-01
Down DefendersInTheBox -1.435000e-01 5.800000e-02 -2.493 0.013 -2.560000e-01 -3.100000e-02
Runner_WR Down -8.586000e-01 3.440000e-01 -2.496 0.013 -1.533000e+00 -1.840000e-01
Wildcat ypg 3.963100e+00 1.620000e+00 2.447 0.014 7.880000e-01 7.138000e+00
PlayDirection_Left Runner_HB 1.078600e+00 4.500000e-01 2.396 0.017 1.960000e-01 1.961000e+00
Wildcat Runner_QB 1.587730e+01 6.784000e+00 2.340 0.019 2.580000e+00 2.917500e+01
rainy_weather team_rushing_yards -4.061000e-01 1.740000e-01 -2.331 0.020 -7.480000e-01 -6.500000e-02
Runner_WR TE -1.033900e+00 4.610000e-01 -2.241 0.025 -1.938000e+00 -1.290000e-01
In [73]:
model.pvalues[model.pvalues < 0.05]
Out[73]:
const                                 3.609842e-120
Runner_WR                              6.449280e-37
Runner_QB                              1.779633e-05
ypg                                    1.135633e-04
YardLine_refactor                      4.101695e-04
DefendersInTheBox                      4.168597e-04
const Runner_WR                        6.449280e-37
const Runner_QB                        1.779633e-05
const ypg                              1.135633e-04
const YardLine_refactor                4.101695e-04
const DefendersInTheBox                4.168597e-04
Team_Home Shotgun                      2.843517e-02
Shotgun PlayDirection_Left             3.279190e-02
Wildcat Runner_QB                      1.927637e-02
Wildcat ypg                            1.441754e-02
Wildcat Down                           4.519529e-02
PlayDirection_Left Runner_HB           1.657705e-02
PlayDirection_Left Outdoors            4.510656e-02
Runner_WR ypg                          8.879133e-37
Runner_WR YardLine_refactor            3.776579e-03
Runner_WR team_rushing_yards           4.037502e-03
Runner_WR Down                         1.255747e-02
Runner_WR Distance                     9.389660e-03
Runner_WR TE                           2.505924e-02
Runner_WR WR                           1.648629e-03
Runner_HB Runner_QB                    5.465292e-03
Runner_QB ypg                          1.839269e-07
Retractable_open DefendersInTheBox     2.741149e-02
rainy_weather team_rushing_yards       1.976285e-02
ypg YardLine_refactor                  2.630996e-03
ypg WR                                 3.654479e-02
YardLine_refactor Down                 2.861573e-02
YardLine_refactor Distance             1.028508e-11
TimeElapsed PossessionScore            3.528011e-02
Down DL                                4.707488e-02
Down LB                                4.115274e-02
Down DefendersInTheBox                 1.268643e-02
PossessionScore DefenseScore           2.803914e-02
OL PlayerWeight                        1.193671e-02
TE PlayerWeight                        1.339491e-03
dtype: float64

In this analysis of interaction (synergy) effects from the predictors many were found to have interacting effects. Some of these include the 'Wildcat' formation interacting with the predictor for if the QB is the runner, the runners ypg and the down. Other interacting variables are the Runner_WR (indicating the runner is a WR) with ypg, yard line, down, distance, number of TE's and WR's in the formation and yards per game. Another interaction is PossessionScore (offense score) and DefenseScore.

Regression Models

In [100]:
regression_models = []
regression_train_MSE = []
regression_test_MSE = []

Null Model

The null model is defined as the average rushing yards over the entire data set.

In [75]:
y_train['Yards'].mean()
Out[75]:
4.25783702429585
In [101]:
null_train_predictions = [y_train['Yards'].mean()] * len(X_train.index)
null_test_predictions = [y_train['Yards'].mean()] * len(X_test.index)
train_MSE = mean_squared_error(y_train,null_train_predictions)
test_MSE = mean_squared_error(y_test, null_test_predictions)
regression_models.append('Null')
regression_train_MSE.append(train_MSE)
regression_test_MSE.append(test_MSE)
In [79]:
train_MSE
Out[79]:
42.77075937227789
In [80]:
test_MSE
Out[80]:
38.09438939451306

The Null Model gives a baseline test MSE of 38.09.

Decision Trees

Full Regression Tree

In [154]:
regression_tree_clf = DecisionTreeRegressor(max_leaf_nodes=10)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train)
In [155]:
dot_data = tree.export_graphviz(regression_tree_clf, out_file=None,
                               feature_names=X_train.columns,
                               filled=True, rounded=True,
                               special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[155]:
Tree 0 YardLine_refactor ≤ 1.552 mse = 42.771 samples = 23255 value = 4.258 1 DefendersInTheBox ≤ 0.585 mse = 45.797 samples = 21090 value = 4.492 0->1 True 2 YardLine_refactor ≤ 1.908 mse = 7.557 samples = 2165 value = 1.978 0->2 False 3 ypg ≤ -2.051 mse = 44.968 samples = 16396 value = 4.692 1->3 4 mse = 48.065 samples = 4694 value = 3.792 1->4 5 mse = 15.039 samples = 149 value = 0.812 3->5 6 Runner_WR ≤ 0.5 mse = 45.103 samples = 16247 value = 4.728 3->6 7 DefendersInTheBox ≤ -1.403 mse = 43.773 samples = 15802 value = 4.648 6->7 8 ypg ≤ -1.393 mse = 84.131 samples = 445 value = 7.551 6->8 13 mse = 60.511 samples = 844 value = 6.117 7->13 14 mse = 42.7 samples = 14958 value = 4.565 7->14 9 mse = 63.14 samples = 430 value = 6.995 8->9 10 TE ≤ -1.428 mse = 423.716 samples = 15 value = 23.467 8->10 15 mse = 0.0 samples = 1 value = 65.0 10->15 16 team_rushing_yards ≤ -0.702 mse = 321.964 samples = 14 value = 20.5 10->16 17 mse = 10.889 samples = 6 value = 4.333 16->17 18 mse = 212.234 samples = 8 value = 32.625 16->18 11 mse = 9.047 samples = 1412 value = 2.701 2->11 12 mse = 1.948 samples = 753 value = 0.623 2->12

The Full Decision tree shows that the three most important predictors for the rushing yards regression decision tree are field position (YardLine_refactor), defenders in the box and the rushers yards per game. A the field position goes up (which means that the team is getting closer to their oponents goal line, ie moving in the positive direction up the field), the number of yards a running play will be expected to gain goes down. This could be because as they get closer to the goal line there are just less yards left available to gain as well as the defenses may change strategy with less room to give. It also shows that the more defenders in the box, the less rushing yards can be expected. The 'box' is a rectangle that is the width of the offensive line and about 5 yards deep. These are essentially the number of defenders close to the ball at the snap. Yards per game is a measurement of the rushers total yards gained consistently over every game they run the ball in. This means that more consistent rushers have a higher ypg and according to this decision tree are more likely to gain more yards.

In [104]:
null_train_predictions = train_predictions = regression_tree_clf.predict(X_train)
null_test_predictions = test_predictions = regression_tree_clf.predict(X_test)
train_MSE = mean_squared_error(y_train,null_train_predictions)
test_MSE = mean_squared_error(y_test, null_test_predictions)
In [105]:
train_MSE
Out[105]:
41.315466079574364
In [106]:
test_MSE
Out[106]:
37.572015304779896
In [107]:
regression_models.append('Decision_Tree')
regression_train_MSE.append(train_MSE)
regression_test_MSE.append(test_MSE)
In [108]:
regression_models_df = pd.DataFrame({'regression_models': regression_models,'regression_train_MSE': regression_train_MSE, 'regression_test_MSE':regression_test_MSE})
regression_models_df
Out[108]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015

Tree Pruning

First the tree will be pruned by running cross validation on each tree size and determining the best model out of all the created tree sizes.

In [156]:
path = regression_tree_clf.cost_complexity_pruning_path(X_train,y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
In [157]:
ccp_alphas
Out[157]:
array([0.        , 0.08273443, 0.09121541, 0.09874274, 0.1376068 ,
       0.53343076])
In [158]:
tree_size = range(2,11)
In [174]:
def run_cross_validation_on_regression_trees(X, y, tree_size, cv=5, scoring='neg_mean_squared_error'):
    cv_scores_list = []
    cv_scores_std = []
    cv_scores_mean = []
    accuracy_scores = []
    for size in tree_size:
        tree_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=size)
        cv_scores = cross_val_score(tree_model, X, y, cv=cv, scoring=scoring)
        cv_scores_list.append(cv_scores)
        cv_scores_mean.append(cv_scores.mean())
        cv_scores_std.append(cv_scores.std())
        accuracy_scores.append(mean_squared_error(y, tree_model.fit(X, y).predict(X)))
    cv_scores_mean = - np.array(cv_scores_mean)
    cv_scores_std = np.array(cv_scores_std)
    accuracy_scores = np.array(accuracy_scores)
    return cv_scores_mean, cv_scores_std, accuracy_scores
In [180]:
def plot_cross_validation_on_regression_trees(tree_size, cv_scores_mean, cv_scores_std, accuracy_scores, title):
    fig, ax = plt.subplots(1,1, figsize=(15,5))
    ax.plot(tree_size, cv_scores_mean, '-o', label='mean cross-validation accuracy', alpha=0.9)
#     ax.fill_between(ccp_alphas, cv_scores_mean-2*cv_scores_std, cv_scores_mean+2*cv_scores_std, alpha=0.2)
#     ylim = plt.ylim()
    ax.plot(tree_size, accuracy_scores, '-*', label='train accuracy', alpha=0.9)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Terminal Nodes', fontsize=14)
    ax.set_ylabel('MSE', fontsize=14)
#     ax.set_ylim(ylim)
#     ax.set_xticks(ccp_alphas)
    ax.legend()
In [181]:
sm_cv_scores_mean
Out[181]:
array([42.25353049, 42.185192  , 42.12407724, 42.09483534, 42.24199159,
       42.1402264 , 42.04656875, 42.02843547, 42.05061348])
In [182]:
sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores = run_cross_validation_on_regression_trees(X_train, y_train, tree_size)
In [184]:
plot_cross_validation_on_regression_trees(tree_size, sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores, 'MSE vs. Terminal Nodes on Cross-Validation training data')

From the graph above, it appears that a tree with 5 terminal nodes has the lowest error and is simpler than the full tree determined earlier. This model with 5 Terminal Nodes will be used moving forward for the pruned tree.

In [255]:
regression_tree_clf = DecisionTreeRegressor(max_leaf_nodes=5)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train)
In [204]:
dot_data = tree.export_graphviz(regression_tree_clf, out_file=None,
                               feature_names=X_train.columns,
                               filled=True, rounded=True,
                               special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[204]:
Tree 0 YardLine_refactor ≤ 1.552 mse = 42.771 samples = 23255 value = 4.258 1 DefendersInTheBox ≤ 0.585 mse = 45.797 samples = 21090 value = 4.492 0->1 True 2 mse = 7.557 samples = 2165 value = 1.978 0->2 False 3 ypg ≤ -2.051 mse = 44.968 samples = 16396 value = 4.692 1->3 4 mse = 48.065 samples = 4694 value = 3.792 1->4 5 mse = 15.039 samples = 149 value = 0.812 3->5 6 Runner_WR ≤ 0.5 mse = 45.103 samples = 16247 value = 4.728 3->6 7 mse = 43.773 samples = 15802 value = 4.648 6->7 8 mse = 84.131 samples = 445 value = 7.551 6->8
In [256]:
train_predictions = regression_tree_clf.predict(X_train)
test_predictions = regression_tree_clf.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [257]:
train_MSE
Out[257]:
41.85600002775729
In [258]:
test_MSE
Out[258]:
37.42151181362354
In [193]:
regression_models.append('Decision_Tree')
regression_train_MSE.append(train_MSE)
regression_test_MSE.append(test_MSE)
In [259]:
regression_models_df = regression_models_df.append({'regression_models':'Pruned_Tree', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
In [260]:
regression_models_df
Out[260]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512

From these two tree models some interesting information about the predictors can be visualized. First, clearly field position (YardLine_refactor) is a key component to determining rushing yards, followed by the number of defenders in the box. These two, along with the runners yards per game and the binary predictor for if the runner is a wide receiver round out the pruned decision tree. These are 4 variables that should be kept track of as further models are analyzed

Cost Complexity Tree Pruning

Now the Tree will be pruned based on cost complexity pruning, using cross validation to obtain an alpha (tuning parameter)value on the training set.

In [1133]:
regression_tree_clf = DecisionTreeRegressor(max_leaf_nodes=10)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train_regression)
In [1134]:
path = regression_tree_clf.cost_complexity_pruning_path(X_train, y_train_regression)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
In [1135]:
ccp_alphas
Out[1135]:
array([0.        , 0.08273443, 0.09121541, 0.09874274, 0.1376068 ,
       0.53343076])
In [1136]:
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
Out[1136]:
Text(0.5, 1.0, 'Total Impurity vs effective alpha for training set')
In [1137]:
alphas = np.linspace(0,max(ccp_alphas[:-1]),20)
In [1138]:
alphas
Out[1138]:
array([0.        , 0.00724246, 0.01448493, 0.02172739, 0.02896985,
       0.03621232, 0.04345478, 0.05069724, 0.05793971, 0.06518217,
       0.07242463, 0.0796671 , 0.08690956, 0.09415202, 0.10139448,
       0.10863695, 0.11587941, 0.12312187, 0.13036434, 0.1376068 ])
In [1143]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for alpha in alphas:
    tree_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=10,ccp_alpha=alpha)
    cv_scores = cross_val_score(tree_model, X_train, y_train_regression.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train_regression,tree_model.fit(X_train, y_train_regression.values.ravel()).predict(X_train)))
In [1144]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Accuracy vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter alpha', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[1144]:
<matplotlib.legend.Legend at 0x1e98912ab48>
In [1152]:
alphas[8]
Out[1152]:
0.05793970559631957

From the Cross validation on Tuning parameter, a tuning parameter of 0.058 will be used for the Cost Complexity Pruning Regression Tree Model

In [1158]:
regression_tree_clf = DecisionTreeRegressor(random_state=0,ccp_alpha=0.058, max_leaf_nodes=10)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train_regression)
In [1159]:
train_predictions = regression_tree_clf.predict(X_train)
test_predictions = regression_tree_clf.predict(X_test)
train_MSE = mean_squared_error(y_train_regression,train_predictions)
test_MSE = mean_squared_error(y_test_regression, test_predictions)
In [1160]:
train_MSE
Out[1160]:
41.315466079574364
In [1161]:
test_MSE
Out[1161]:
37.572015304779896
In [ ]:
 
In [1164]:
dot_data = tree.export_graphviz(regression_tree_clf, out_file=None,
                               feature_names=X_train.columns,
                               filled=True, rounded=True,
                               special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[1164]:
Tree 0 YardLine_refactor ≤ 1.552 mse = 42.771 samples = 23255 value = 4.258 1 DefendersInTheBox ≤ 0.585 mse = 45.797 samples = 21090 value = 4.492 0->1 True 16 YardLine_refactor ≤ 1.908 mse = 7.557 samples = 2165 value = 1.978 0->16 False 2 ypg ≤ -2.051 mse = 44.968 samples = 16396 value = 4.692 1->2 15 mse = 48.065 samples = 4694 value = 3.792 1->15 3 mse = 15.039 samples = 149 value = 0.812 2->3 4 Runner_WR ≤ 0.5 mse = 45.103 samples = 16247 value = 4.728 2->4 5 DefendersInTheBox ≤ -1.403 mse = 43.773 samples = 15802 value = 4.648 4->5 8 ypg ≤ -1.393 mse = 84.131 samples = 445 value = 7.551 4->8 6 mse = 60.511 samples = 844 value = 6.117 5->6 7 mse = 42.7 samples = 14958 value = 4.565 5->7 9 mse = 63.14 samples = 430 value = 6.995 8->9 10 WR ≤ 1.519 mse = 423.716 samples = 15 value = 23.467 8->10 11 team_rushing_yards ≤ -0.702 mse = 321.964 samples = 14 value = 20.5 10->11 14 mse = 0.0 samples = 1 value = 65.0 10->14 12 mse = 10.889 samples = 6 value = 4.333 11->12 13 mse = 212.234 samples = 8 value = 32.625 11->13 17 mse = 9.047 samples = 1412 value = 2.701 16->17 18 mse = 1.948 samples = 753 value = 0.623 16->18
In [1168]:
backup = copy.deepcopy(regression_models_df)
In [1169]:
data = pd.DataFrame({'regression_models':'Cost_Complexity_Pruned_Tree', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},index=[2.5])
In [1170]:
regression_models_df = regression_models_df.append(data, ignore_index=False)
regression_models_df = regression_models_df.sort_index().reset_index(drop=True)
regression_models_df
Out[1170]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Cost_Complexity_Pruned_Tree 41.315466 37.572015
4 Bagging 41.048588 37.278613
5 Random Forest 41.105248 37.169823
6 Boosting 41.109360 37.085639
7 Multi_Linear_Regression 41.724492 37.179962
8 Forward_Stepwise_Subset 41.731454 37.199699
9 Backward_Stepwise_Subset 41.731454 37.199699
10 Ridge_Regression 41.728287 37.176765
11 Lasso_Regression 41.740725 37.167263
12 Partial_Least_Squares 41.735556 37.207023

The Cost complexity Pruning approach ended up obtaining the same results as the full tree for this data set.

Bagging

In [589]:
clf_bagging = RandomForestRegressor(max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = len(X_train.columns))
In [590]:
clf_bagging = clf_bagging.fit(X_train,y_train.values.ravel())
In [591]:
oob_predictions = clf_bagging.oob_prediction_
oob_MSE = mean_squared_error(y_train,oob_predictions)
In [592]:
oob_MSE
Out[592]:
41.630068964009595
In [593]:
train_predictions = clf_bagging.predict(X_train)
test_predictions = clf_bagging.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [594]:
train_MSE
Out[594]:
41.04858752732196
In [595]:
test_MSE
Out[595]:
37.27861251847982

Using a Bagging approach gives an OOB error on the training data of 41.04 but a test MSE of 37.28, which is slightly less than the pruned decision tree.

In [596]:
backup = copy.deepcopy(regression_models_df)
In [597]:
regression_models_df = regression_models_df.append({'regression_models':'Bagging', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
regression_models_df
Out[597]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
In [272]:
clf_bagging.feature_importances_
Out[272]:
array([0.00048166, 0.00400473, 0.        , 0.00316039, 0.07742692,
       0.        , 0.00107826, 0.00234828, 0.0019371 , 0.        ,
       0.00187719, 0.19435958, 0.3910749 , 0.02385226, 0.0329065 ,
       0.01378748, 0.00165208, 0.01209658, 0.01446677, 0.00555896,
       0.        , 0.02326459, 0.01754258, 0.00248425, 0.0005573 ,
       0.10583813, 0.02479653, 0.02206828, 0.00945614, 0.01192256])
In [273]:
tree_importance_sorted_idx = np.argsort(clf_bagging.feature_importances_)
tree_indices = np.arange(0, len(clf_bagging.feature_importances_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_bagging.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_bagging.feature_importances_)))
ax1.set_title('Bagging Variable Importance')
Out[273]:
Text(0.5, 1.0, 'Bagging Variable Importance')

The Bagging predictor importance chart shows similar predictor information to the previously calculated trees. Field Position (YardLine_refactor), rushers yards per game, defenders in the box, and if the rusher is a wide receiver are the most important predictors according to this bagging model.

Random Forest

In [275]:
len(X_train.columns)
Out[275]:
30
In [277]:
np.sqrt(len(X_train.columns))
Out[277]:
5.477225575051661
In [297]:
num_predictors = range(1,len(X_train.columns)+1)
In [299]:
oob_MSE_list = []
train_MSE_list = []

for num in num_predictors:
    clf_rf = RandomForestRegressor(max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = num)
    clf_rf = clf_rf.fit(X_train,y_train.values.ravel())

    oob_predictions = clf_rf.oob_prediction_
    oob_MSE = mean_squared_error(y_train,oob_predictions)
    oob_MSE_list.append(oob_MSE)
    
    train_predictions = clf_rf.predict(X_train)
    train_MSE = mean_squared_error(y_train,train_predictions)
    train_MSE_list.append(train_MSE)
    
In [301]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(num_predictors, oob_MSE_list, '-o', label='OOB MSE', alpha=0.9)
ax.plot(num_predictors, train_MSE_list, '-*', label='train MSE', alpha=0.9)
ax.set_title('Random Forest: MSE vs. Number of Predictors', fontsize=16)
ax.set_xlabel('Number of Predictors', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[301]:
<matplotlib.legend.Legend at 0x1e986d3c808>

From the plot above with MSE vs. Number of Predictors, it appears the OOB MSE appears to level off around 18 predictors.

In [323]:
number_trees = np.arange(25,525,25)
number_trees = np.insert(number_trees, 0,1)
In [317]:
oob_MSE_list = []
train_MSE_list = []

for num in number_trees:
    clf_rf = RandomForestRegressor(n_estimators=num, max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = 18)
    clf_rf = clf_rf.fit(X_train,y_train.values.ravel())

    oob_predictions = clf_rf.oob_prediction_
    oob_MSE = mean_squared_error(y_train,oob_predictions)
    oob_MSE_list.append(oob_MSE)
    
    train_predictions = clf_rf.predict(X_train)
    train_MSE = mean_squared_error(y_train,train_predictions)
    train_MSE_list.append(train_MSE)
    print(num)
    
d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\ensemble\_forest.py:832: UserWarning:

Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.

In [319]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(number_trees, oob_MSE_list, '-o', label='m=18 OOB MSE', alpha=0.9)
ax.plot(number_trees, train_MSE_list, '-*', label='m=18 train MSE', alpha=0.9)
ax.set_title('Random Forest: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[319]:
<matplotlib.legend.Legend at 0x1e9869e4a48>
In [322]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(number_trees[1:], oob_MSE_list[1:], '-o', label='m=18 OOB MSE', alpha=0.9)
ax.plot(number_trees[1:], train_MSE_list[1:], '-*', label='m=18 train MSE', alpha=0.9)
ax.set_title('Random Forest: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[322]:
<matplotlib.legend.Legend at 0x1e986942d48>

From this plot it appears that the MSE levels off around 200 Trees. A Random Forest with 18 predictors and 200 Trees will be used as the final Random Forest Model.

In [598]:
clf_rf = RandomForestRegressor(n_estimators=200, max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = 18)
In [599]:
clf_rf = clf_rf.fit(X_train,y_train.values.ravel())
In [600]:
oob_predictions = clf_rf.oob_prediction_
oob_MSE = mean_squared_error(y_train,oob_predictions)
In [601]:
oob_MSE
Out[601]:
41.64717986819891
In [602]:
train_predictions = clf_rf.predict(X_train)
test_predictions = clf_rf.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [603]:
train_MSE
Out[603]:
41.10524839655702
In [604]:
test_MSE
Out[604]:
37.169823144543216

Using a Random Forest approach with 18 predictors and 200 Trees gives an OOB error on the training data of 41.64 but a test MSE of 37.17, which is slightly less than the Bagging Model MSE.

In [605]:
backup = copy.deepcopy(regression_models_df)
In [606]:
regression_models_df = regression_models_df.append({'regression_models':'Random Forest', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
regression_models_df
Out[606]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
In [335]:
tree_importance_sorted_idx = np.argsort(clf_rf.feature_importances_)
tree_indices = np.arange(0, len(clf_rf.feature_importances_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_rf.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_bagging.feature_importances_)))
ax1.set_title('Random Forest Variable Importance')
Out[335]:
Text(0.5, 1.0, 'Random Forest Variable Importance')

Similarly to the Bagging Variable importance, field position, rusher yards per game and defenders in the box are the most important variables to predicting rushing yards on a given play. All of these top 3 variables do appear to be slightly less important in the Random Forest model compared to the the Bagging model though.

Boosting

In [344]:
# Determine number of trees using cross validation
number_trees = np.arange(250,5250,250)
number_trees = np.arange(500,5500,500)
number_trees = np.arange(25,525,25)
In [ ]:
clf_boosting3
In [435]:
number_trees = np.arange(50,250,50)
In [427]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
# train_MSE_list = []
for size in number_trees:
    clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4, 
                                     random_state=0, 
                                  n_estimators=size)
    cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
#     train_MSE_list.append(mean_squared_error(y_train, clf_boosting.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(size)
50
100
150
200
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-427-1b06ff08b97e> in <module>
      7                                      random_state=0,
      8                                   n_estimators=size)
----> 9     cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
     10     cv_scores_list.append(cv_scores)
     11     cv_scores_mean.append(-cv_scores.mean())

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    404                                 fit_params=fit_params,
    405                                 pre_dispatch=pre_dispatch,
--> 406                                 error_score=error_score)
    407     return cv_results['test_score']
    408 

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    246             return_times=True, return_estimator=return_estimator,
    247             error_score=error_score)
--> 248         for train, test in cv.split(X, y, groups))
    249 
    250     zipped_scores = list(zip(*scores))

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1005                 self._iterating = self._original_iterator is not None
   1006 
-> 1007             while self.dispatch_one_batch(iterator):
   1008                 pass
   1009 

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837 

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591 
    592     def get(self):

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    529             estimator.fit(X_train, **fit_params)
    530         else:
--> 531             estimator.fit(X_train, y_train, **fit_params)
    532 
    533     except Exception as e:

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\ensemble\_gb.py in fit(self, X, y, sample_weight, monitor)
    498         n_stages = self._fit_stages(
    499             X, y, raw_predictions, sample_weight, self._rng, X_val, y_val,
--> 500             sample_weight_val, begin_at_stage, monitor, X_idx_sorted)
    501 
    502         # change shape of arrays after fit (early-stopping or additional ests)

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\ensemble\_gb.py in _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, X_val, y_val, sample_weight_val, begin_at_stage, monitor, X_idx_sorted)
    555             raw_predictions = self._fit_stage(
    556                 i, X, y, raw_predictions, sample_weight, sample_mask,
--> 557                 random_state, X_idx_sorted, X_csc, X_csr)
    558 
    559             # track deviance (= loss)

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\ensemble\_gb.py in _fit_stage(self, i, X, y, raw_predictions, sample_weight, sample_mask, random_state, X_idx_sorted, X_csc, X_csr)
    210             X = X_csr if X_csr is not None else X
    211             tree.fit(X, residual, sample_weight=sample_weight,
--> 212                      check_input=False, X_idx_sorted=X_idx_sorted)
    213 
    214             # update tree leaves

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
   1244             sample_weight=sample_weight,
   1245             check_input=check_input,
-> 1246             X_idx_sorted=X_idx_sorted)
   1247         return self
   1248 

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    373                                            min_impurity_split)
    374 
--> 375         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    376 
    377         if self.n_outputs_ == 1 and is_classifier(self):

KeyboardInterrupt: 
In [469]:
cv_scores_mean
Out[469]:
[41.66584416506114, 41.9609556857251, 42.18247213184469, 42.41711654023884]
In [436]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
# ax.plot(number_trees, oob_MSE_list, '-o', label='m=18 OOB MSE', alpha=0.9)
ax.plot(number_trees, cv_scores_mean, '-*', label='Boosting CV MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[436]:
<matplotlib.legend.Legend at 0x1e9875b8a88>

Even at relatively low number of trees (from 50 to 200 trees) the CV MSE is increasing for Boosting. With this information we will look to tune to lwoer numbers of trees.

In [449]:
cv_scores_list2 = []
cv_scores_std2 = []
cv_scores_mean2 = []
train_MSE_list = []
for size in number_trees:
    clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4, 
                                     random_state=0, 
                                  n_estimators=size)
    cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list2.append(cv_scores)
    cv_scores_mean2.append(-cv_scores.mean())
    cv_scores_std2.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_boosting.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(size)
10
15
20
25
30
35
40
45
In [475]:
combined_num_trees = list(np.arange(10,50,5)) + list(np.arange(50,250,50))
combined_cv_scores =  list(cv_scores_mean2)  + list(cv_scores_mean)
In [476]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
# ax.plot(number_trees, oob_MSE_list, '-o', label='m=18 OOB MSE', alpha=0.9)
ax.plot(combined_num_trees, combined_cv_scores, '-*', label='Boosting CV MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[476]:
<matplotlib.legend.Legend at 0x1e98b944788>
In [452]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(number_trees, cv_scores_mean2, '-o', label='CV MSE', alpha=0.9)
ax.plot(number_trees, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[452]:
<matplotlib.legend.Legend at 0x1e987b0ffc8>

Looking at this plot of MSE vs. Number of Trees for Boosting (with a shrinkage parameter of 0.1), it appears 25 Trees is a good amount for boosting based on the cross validation MSE. It's interesting to note that the over-fitting due to a higher number of trees is prevalent, shown by the decreasing Train MSE with increasing CV MSE, which isn't the case for bagging and random forests.

In [477]:
lamda = np.arange(0.001,0.1,(0.1-0.001)/20).tolist()
lamda.append(0.1)
In [478]:
cv_scores_list_lamda = []
cv_scores_std_lamda = []
cv_scores_mean_lamda = []
train_MSE_list = []

for learning_rate in lamda:
    clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4, 
                                     random_state=0, 
                                  n_estimators=25, learning_rate=learning_rate)
    cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list_lamda.append(cv_scores)
    cv_scores_mean_lamda.append(-cv_scores.mean())
    cv_scores_std_lamda.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_boosting.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(learning_rate)
0.001
0.00595
0.0109
0.015850000000000003
0.020800000000000003
0.025750000000000002
0.030700000000000005
0.03565
0.040600000000000004
0.04555000000000001
0.0505
0.055450000000000006
0.06040000000000001
0.06535
0.0703
0.07525000000000001
0.08020000000000001
0.08515
0.09010000000000001
0.09505000000000001
0.1
In [480]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(lamda, cv_scores_mean_lamda, '-o', label='CV MSE', alpha=0.9)
ax.plot(lamda, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Shrinkage Parameter Lambda', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[480]:
<matplotlib.legend.Legend at 0x1e98c0f8b48>

This plot of MSE vs. shrinkage parameter shows that a value of lambda=0.07 is where CV MSE is minimized for the Boosting model. A Boosting Model with 30 trees and a shrinkage parameter of 0.07 will be used in the final Boosting Model.

In [619]:
clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4, 
                                     random_state=0, 
                                  n_estimators=25, learning_rate=0.07)
In [620]:
clf_boosting = clf_boosting.fit(X_train,y_train.values.ravel())
In [621]:
train_predictions = clf_boosting.predict(X_train)
test_predictions = clf_boosting.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [622]:
train_MSE
Out[622]:
41.10936022480303
In [623]:
test_MSE
Out[623]:
37.085639117738346

Using a Boostingg approach with 25 trees and a shrinkage parameter of 0.07 gives a training MSE of 41.1 and a test MSE of 37.09. This values is slightly lower than the values found for Bagging and Random Forest.

In [624]:
backup = copy.deepcopy(regression_models_df)
In [625]:
regression_models_df = regression_models_df.append({'regression_models':'Boosting', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
regression_models_df
Out[625]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
5 Boosting 41.109360 37.085639
In [529]:
tree_importance_sorted_idx = np.argsort(clf_boosting.feature_importances_)
tree_indices = np.arange(0, len(clf_boosting.feature_importances_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_boosting.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_boosting.feature_importances_)))
Out[529]:
(0, 30)

The variable importance chart for Boosting shows more variables are insignificant to the response. Field postition, rushers yards per game and defenders in the box have the most importance to the response.

Linear Regression

In [626]:
clf_linear = LinearRegression()
clf_linear = clf_linear.fit(X_train,y_train)
In [627]:
train_predictions = clf_linear.predict(X_train)
test_predictions = clf_linear.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [628]:
train_MSE
Out[628]:
41.724491650511105
In [629]:
test_MSE
Out[629]:
37.17996169210267
In [630]:
regression_models_df = regression_models_df.append({'regression_models':'Multi_Linear_Regression', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
regression_models_df
Out[630]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
5 Boosting 41.109360 37.085639
6 Multi_Linear_Regression 41.724492 37.179962

Subset Selection

Forward Subset Selection

In [536]:
def fit_linear_reg(X,Y):
    #Fit linear regression model and return RSS and R squared values
    model_k = LinearRegression(fit_intercept = True)
    model_k.fit(X,Y)
    RSS = mean_squared_error(Y,model_k.predict(X)) * len(Y)
    R_squared = model_k.score(X,Y)
    return RSS, R_squared
In [545]:
remaining_features = list(X_train.columns.values)
features = []
RSS_list, R_squared_list = [np.inf], [np.inf] #Due to 1 indexing of the loop...
features_list = dict()
In [546]:
k = 30
In [547]:
for i in range(1,k+1):
    best_RSS = np.inf
    
    for combo in itertools.combinations(remaining_features,1):

            RSS = fit_linear_reg(X_train[list(combo) + features],y_train)   #Store temp result 

            if RSS[0] < best_RSS:
                best_RSS = RSS[0]
                best_R_squared = RSS[1] 
                best_feature = combo[0]

    #Updating variables for next loop
    features.append(best_feature)
    remaining_features.remove(best_feature)
    
    #Saving values for plotting
    RSS_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    features_list[i] = features.copy()
In [550]:
df1 = pd.concat([pd.DataFrame({'features':features_list}),pd.DataFrame({'RSS':RSS_list, 'R_squared': R_squared_list})], axis=1, join='inner')
df1['numb_features'] = df1.index
In [552]:
# Compute adjusted R_squared
In [561]:
m = len(y_train)
p = 31
hat_sigma_squared = (1/(m - p -1)) * min(df1['RSS'])
df1['R_squared_adj'] = 1 - ( (1 - df1['R_squared'])*(m-1)/(m-df1['numb_features'] -1))
In [572]:
df1.sort_values('R_squared_adj',ascending=False)
Out[572]:
features RSS R_squared numb_features R_squared_adj
17 [DefendersInTheBox, YardLine_refactor, Runner_... 970464.967935 0.024299 17 0.023586
18 [DefendersInTheBox, YardLine_refactor, Runner_... 970426.215674 0.024338 18 0.023583
19 [DefendersInTheBox, YardLine_refactor, Runner_... 970399.107674 0.024366 19 0.023568
16 [DefendersInTheBox, YardLine_refactor, Runner_... 970538.618823 0.024225 16 0.023554
20 [DefendersInTheBox, YardLine_refactor, Runner_... 970376.294292 0.024389 20 0.023549
21 [DefendersInTheBox, YardLine_refactor, Runner_... 970355.594151 0.024409 21 0.023528
15 [DefendersInTheBox, YardLine_refactor, Runner_... 970618.708986 0.024145 15 0.023515
22 [DefendersInTheBox, YardLine_refactor, Runner_... 970339.240933 0.024426 22 0.023502
23 [DefendersInTheBox, YardLine_refactor, Runner_... 970323.546839 0.024442 23 0.023476
14 [DefendersInTheBox, YardLine_refactor, Runner_... 970706.833951 0.024056 14 0.023468
24 [DefendersInTheBox, YardLine_refactor, Runner_... 970315.006576 0.024450 24 0.023442
13 [DefendersInTheBox, YardLine_refactor, Runner_... 970802.633138 0.023960 13 0.023414
25 [DefendersInTheBox, YardLine_refactor, Runner_... 970311.639667 0.024454 25 0.023404
26 [DefendersInTheBox, YardLine_refactor, Runner_... 970308.696606 0.024457 26 0.023365
12 [DefendersInTheBox, YardLine_refactor, Runner_... 970928.703002 0.023833 12 0.023329
27 [DefendersInTheBox, YardLine_refactor, Runner_... 970306.338212 0.024459 27 0.023325
28 [DefendersInTheBox, YardLine_refactor, Runner_... 970304.155305 0.024461 28 0.023285
29 [DefendersInTheBox, YardLine_refactor, Runner_... 970303.568734 0.024462 29 0.023244
30 [DefendersInTheBox, YardLine_refactor, Runner_... 970303.053333 0.024462 30 0.023202
11 [DefendersInTheBox, YardLine_refactor, Runner_... 971186.645573 0.023574 11 0.023112
10 [DefendersInTheBox, YardLine_refactor, Runner_... 971353.891776 0.023406 10 0.022986
9 [DefendersInTheBox, YardLine_refactor, Runner_... 971686.986482 0.023071 9 0.022693
8 [DefendersInTheBox, YardLine_refactor, Runner_... 971850.886985 0.022906 8 0.022570
7 [DefendersInTheBox, YardLine_refactor, Runner_... 972056.201951 0.022700 7 0.022405
6 [DefendersInTheBox, YardLine_refactor, Runner_... 972427.114734 0.022327 6 0.022074
5 [DefendersInTheBox, YardLine_refactor, Runner_... 972927.576602 0.021824 5 0.021613
4 [DefendersInTheBox, YardLine_refactor, Runner_... 973928.796999 0.020817 4 0.020648
3 [DefendersInTheBox, YardLine_refactor, Runner_WR] 975940.156810 0.018795 3 0.018668
2 [DefendersInTheBox, YardLine_refactor] 978193.358212 0.016529 2 0.016445
1 [DefendersInTheBox] 984015.011623 0.010676 1 0.010634

From the forward stepwise selection, 17 variables were selected based on the adjusted R^2 value for that model.

In [730]:
best_subset = df1.loc[17]['features']
In [731]:
best_subset
Out[731]:
['DefendersInTheBox',
 'YardLine_refactor',
 'Runner_WR',
 'ypg',
 'team_rushing_yards',
 'Distance',
 'Retractable_open',
 'PossessionScore',
 'LB',
 'DL',
 'WR',
 'TE',
 'PlayerWeight',
 'Team_Home',
 'rainy_weather',
 'Temperature',
 'team_defense_rushing_yards']
In [631]:
clf_linear = LinearRegression()
clf_linear = clf_linear.fit(X_train[best_subset],y_train)
In [632]:
train_predictions = clf_linear.predict(X_train[best_subset])
test_predictions = clf_linear.predict(X_test[best_subset])
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [633]:
train_MSE
Out[633]:
41.73145422208352
In [634]:
test_MSE
Out[634]:
37.19969860833816

The forward subset selection gave a training MSE of 41.7 and a test MSE of 37.2. Surprisingly, this was a slightly higher test MSE than the test MSE for the Linear Regression with all predictors. This could be due to the training and test split or the fact that the forward stepwise subset selection was determined based on adjusted r^2.

In [635]:
backup = copy.deepcopy(regression_models_df)
In [636]:
regression_models_df = regression_models_df.append({'regression_models':'Forward_Stepwise_Subset', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
In [637]:
regression_models_df
Out[637]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
5 Boosting 41.109360 37.085639
6 Multi_Linear_Regression 41.724492 37.179962
7 Forward_Stepwise_Subset 41.731454 37.199699

Backward Subset

In [741]:
remaining_features = list(X_train.columns.values)
features = []
RSS_list, R_squared_list = [], [] #Due to 1 indexing of the loop...
features_list = dict()
num_features = []
In [721]:
for j in range(1,k+1):
# for i in range(k,0,-1):
    i = 31-j
    print(i)
    best_RSS = np.inf
    
    for combo in itertools.combinations(remaining_features,i):

            RSS = fit_linear_reg(X_train[list(combo)],y_train)   #Store temp result 

            if RSS[0] < best_RSS:
                best_RSS = RSS[0]
                best_R_squared = RSS[1] 
                best_combo = combo

    #Updating variables for next loop
    remaining_features = best_combo
    
    #Saving values for plotting
    RSS_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    features_list[j-1] = best_combo
    num_features.append(i)
30
29
28
27
26
25
24
23
22
21
20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
In [722]:
df_back = pd.concat([pd.DataFrame({'features':features_list}),pd.DataFrame({'numb_features':num_features, 'RSS':RSS_list, 'R_squared': R_squared_list})], axis=1, join='inner')
In [723]:
df_back
Out[723]:
features numb_features RSS R_squared
0 (Team_Home, Shotgun, Wildcat, PlayDirection_Le... 30 970303.053333 0.024462
1 (Team_Home, Wildcat, PlayDirection_Left, Runne... 29 970303.568734 0.024462
2 (Team_Home, Wildcat, PlayDirection_Left, Runne... 28 970304.155305 0.024461
3 (Team_Home, Wildcat, PlayDirection_Left, Runne... 27 970306.338212 0.024459
4 (Team_Home, Wildcat, PlayDirection_Left, Runne... 26 970308.696606 0.024457
5 (Team_Home, Wildcat, PlayDirection_Left, Runne... 25 970311.639667 0.024454
6 (Team_Home, PlayDirection_Left, Runner_WR, Run... 24 970315.006576 0.024450
7 (Team_Home, PlayDirection_Left, Runner_WR, Run... 23 970323.546839 0.024442
8 (Team_Home, PlayDirection_Left, Runner_WR, Run... 22 970339.240933 0.024426
9 (Team_Home, PlayDirection_Left, Runner_WR, Run... 21 970355.594151 0.024409
10 (Team_Home, PlayDirection_Left, Runner_WR, Run... 20 970376.294292 0.024389
11 (Team_Home, PlayDirection_Left, Runner_WR, Ret... 19 970399.107674 0.024366
12 (Team_Home, Runner_WR, Retractable_open, rainy... 18 970426.215674 0.024338
13 (Team_Home, Runner_WR, Retractable_open, rainy... 17 970464.967935 0.024299
14 (Team_Home, Runner_WR, Retractable_open, rainy... 16 970538.618823 0.024225
15 (Team_Home, Runner_WR, Retractable_open, rainy... 15 970618.708986 0.024145
16 (Team_Home, Runner_WR, Retractable_open, ypg, ... 14 970706.833951 0.024056
17 (Runner_WR, Retractable_open, ypg, YardLine_re... 13 970802.633138 0.023960
18 (Runner_WR, Retractable_open, ypg, YardLine_re... 12 970928.703002 0.023833
19 (Runner_WR, Retractable_open, ypg, YardLine_re... 11 971138.047397 0.023623
20 (Runner_WR, Retractable_open, ypg, YardLine_re... 10 971384.544194 0.023375
21 (Runner_WR, Retractable_open, ypg, YardLine_re... 9 971560.260172 0.023198
22 (Runner_WR, Retractable_open, ypg, YardLine_re... 8 971886.328918 0.022870
23 (Runner_WR, Retractable_open, ypg, YardLine_re... 7 972056.201951 0.022700
24 (Runner_WR, ypg, YardLine_refactor, team_rushi... 6 972427.114734 0.022327
25 (Runner_WR, ypg, YardLine_refactor, team_rushi... 5 972927.576602 0.021824
26 (Runner_WR, ypg, YardLine_refactor, DefendersI... 4 973928.796999 0.020817
27 (Runner_WR, YardLine_refactor, DefendersInTheBox) 3 975940.156810 0.018795
28 (YardLine_refactor, DefendersInTheBox) 2 978193.358212 0.016529
29 (DefendersInTheBox,) 1 984015.011623 0.010676
In [724]:
m = len(y_train)
p = 31
hat_sigma_squared = (1/(m - p -1)) * min(df_back['RSS'])
df_back['R_squared_adj'] = 1 - ( (1 - df_back['R_squared'])*(m-1)/(m-df_back['numb_features'] -1))
In [725]:
df_back.sort_values('R_squared_adj',ascending=False)
Out[725]:
features numb_features RSS R_squared R_squared_adj
13 (Team_Home, Runner_WR, Retractable_open, rainy... 17 970464.967935 0.024299 0.023586
12 (Team_Home, Runner_WR, Retractable_open, rainy... 18 970426.215674 0.024338 0.023583
11 (Team_Home, PlayDirection_Left, Runner_WR, Ret... 19 970399.107674 0.024366 0.023568
14 (Team_Home, Runner_WR, Retractable_open, rainy... 16 970538.618823 0.024225 0.023554
10 (Team_Home, PlayDirection_Left, Runner_WR, Run... 20 970376.294292 0.024389 0.023549
9 (Team_Home, PlayDirection_Left, Runner_WR, Run... 21 970355.594151 0.024409 0.023528
15 (Team_Home, Runner_WR, Retractable_open, rainy... 15 970618.708986 0.024145 0.023515
8 (Team_Home, PlayDirection_Left, Runner_WR, Run... 22 970339.240933 0.024426 0.023502
7 (Team_Home, PlayDirection_Left, Runner_WR, Run... 23 970323.546839 0.024442 0.023476
16 (Team_Home, Runner_WR, Retractable_open, ypg, ... 14 970706.833951 0.024056 0.023468
6 (Team_Home, PlayDirection_Left, Runner_WR, Run... 24 970315.006576 0.024450 0.023442
17 (Runner_WR, Retractable_open, ypg, YardLine_re... 13 970802.633138 0.023960 0.023414
5 (Team_Home, Wildcat, PlayDirection_Left, Runne... 25 970311.639667 0.024454 0.023404
4 (Team_Home, Wildcat, PlayDirection_Left, Runne... 26 970308.696606 0.024457 0.023365
18 (Runner_WR, Retractable_open, ypg, YardLine_re... 12 970928.703002 0.023833 0.023329
3 (Team_Home, Wildcat, PlayDirection_Left, Runne... 27 970306.338212 0.024459 0.023325
2 (Team_Home, Wildcat, PlayDirection_Left, Runne... 28 970304.155305 0.024461 0.023285
1 (Team_Home, Wildcat, PlayDirection_Left, Runne... 29 970303.568734 0.024462 0.023244
0 (Team_Home, Shotgun, Wildcat, PlayDirection_Le... 30 970303.053333 0.024462 0.023202
19 (Runner_WR, Retractable_open, ypg, YardLine_re... 11 971138.047397 0.023623 0.023161
20 (Runner_WR, Retractable_open, ypg, YardLine_re... 10 971384.544194 0.023375 0.022955
21 (Runner_WR, Retractable_open, ypg, YardLine_re... 9 971560.260172 0.023198 0.022820
22 (Runner_WR, Retractable_open, ypg, YardLine_re... 8 971886.328918 0.022870 0.022534
23 (Runner_WR, Retractable_open, ypg, YardLine_re... 7 972056.201951 0.022700 0.022405
24 (Runner_WR, ypg, YardLine_refactor, team_rushi... 6 972427.114734 0.022327 0.022074
25 (Runner_WR, ypg, YardLine_refactor, team_rushi... 5 972927.576602 0.021824 0.021613
26 (Runner_WR, ypg, YardLine_refactor, DefendersI... 4 973928.796999 0.020817 0.020648
27 (Runner_WR, YardLine_refactor, DefendersInTheBox) 3 975940.156810 0.018795 0.018668
28 (YardLine_refactor, DefendersInTheBox) 2 978193.358212 0.016529 0.016445
29 (DefendersInTheBox,) 1 984015.011623 0.010676 0.010634

From the backward stepwise selection, 17 variables were selected based on the adjusted R^2 value for that model.

In [756]:
best_subset = np.array(df_back.loc[13]['features'])
In [760]:
clf_backward = LinearRegression()
clf_backward = clf_linear.fit(X_train[best_subset],y_train)
In [761]:
train_predictions = clf_backward.predict(X_train[best_subset])
test_predictions = clf_backward.predict(X_test[best_subset])
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [762]:
train_MSE
Out[762]:
41.73145422208352
In [763]:
test_MSE
Out[763]:
37.19969860833816

The backward subset selection gave similar results to the forward subset selection.

In [764]:
backup = copy.deepcopy(regression_models_df)
In [765]:
regression_models_df = regression_models_df.append({'regression_models':'Backward_Stepwise_Subset', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
In [766]:
regression_models_df
Out[766]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
5 Boosting 41.109360 37.085639
6 Multi_Linear_Regression 41.724492 37.179962
7 Forward_Stepwise_Subset 41.731454 37.199699
8 Backward_Stepwise_Subset 41.731454 37.199699

Shrinkage Methods

Ridge Regression

Cross Validation to determine best tuning parameter on training data

In [792]:
reg = RidgeCV(alphas=np.logspace(-6, 6, 13),
                          cv=5)
ridge_model = reg.fit(X_train,y_train)
ridge_model.alpha_
Out[792]:
100.0
In [822]:
lamda_cv = ridge_model.alpha_
lamda_cv
Out[822]:
100.0
In [798]:
alphas=np.logspace(-6, 6, 13)
In [868]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for alpha in alphas:
    clf_ridge = Ridge(alpha=alpha)
    cv_scores = cross_val_score(clf_ridge, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_ridge.fit(X_train, y_train.values.ravel()).predict(X_train)))
In [869]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Ridge Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
Out[869]:
<matplotlib.legend.Legend at 0x1e98ee28fc8>
In [870]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas[:-3], cv_scores_mean[:-3], '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas[:-3], train_MSE_list[:-3], '-*', label='Train MSE', alpha=0.9)
ax.set_title('Ridge Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
Out[870]:
<matplotlib.legend.Legend at 0x1e9904911c8>
In [839]:
cv_scores_mean
Out[839]:
[41.880779885756944,
 41.880779883455276,
 41.88077986043858,
 41.88077963027584,
 41.880777329070426,
 41.88075435913927,
 41.88052880106404,
 41.878627766376795,
 41.873945402154675,
 41.91277147220411,
 42.00732988840531,
 42.4178470288674,
 42.72014555678897]

From cross validation, the tuning parameter is chosen to be 100.

In [831]:
clf_ridge = Ridge(alpha=100)
clf_ridge = clf_ridge.fit(X_train,y_train)
In [832]:
train_predictions = clf_ridge.predict(X_train)
test_predictions = clf_ridge.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [833]:
train_MSE
Out[833]:
41.72828673730419
In [834]:
test_MSE
Out[834]:
37.176765089729976

For a ridge regression model with a tuning parameter of 100, the training MSE was calculated to be 41.73, with a cross validation (on training data) of 41.87. The test MSE was calculated to be 37.177, which is less than all of the other linear regression models thus far.

In [861]:
importance_sorted_idx = np.argsort(clf_ridge.coef_[0])
tree_indices = np.arange(0, len(clf_ridge.coef_[0])) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_ridge.coef_[0][importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
Out[861]:
(0, 30)

From this plot you can see certain parameters coefficients are much lower than others, as the Ridge Regression tuning parameter is pushing them to zero. The highest coefficient is if the runner is a wide receiver and then rushers yards per game. The largest negative correlations are field position and defenders in the box.

In [836]:
backup = copy.deepcopy(regression_models_df)
In [837]:
regression_models_df = regression_models_df.append({'regression_models':'Ridge_Regression', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
In [838]:
regression_models_df
Out[838]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
5 Boosting 41.109360 37.085639
6 Multi_Linear_Regression 41.724492 37.179962
7 Forward_Stepwise_Subset 41.731454 37.199699
8 Backward_Stepwise_Subset 41.731454 37.199699
9 Ridge_Regression 41.728287 37.176765

Lasso

In [863]:
reg = LassoCV(alphas=np.logspace(-6, 6, 13),
                          cv=5)
lasso_model = reg.fit(X_train,y_train.values.ravel())
lasso_model.alpha_
Out[863]:
0.01
In [ ]:
 
In [872]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for alpha in alphas:
    clf_lasso = Lasso(alpha=alpha)
    cv_scores = cross_val_score(clf_lasso, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_ridge.fit(X_train, y_train.values.ravel()).predict(X_train)))
In [873]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Lasso Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
Out[873]:
<matplotlib.legend.Legend at 0x1e98ea87288>
In [875]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas[:-3], cv_scores_mean[:-3], '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas[:-3], train_MSE_list[:-3], '-*', label='Train MSE', alpha=0.9)
ax.set_title('Lasso Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
Out[875]:
<matplotlib.legend.Legend at 0x1e98dfb9948>

From cross-validation, the tuning parameter is selected to be 0.01.

In [876]:
clf_lasso = Lasso(alpha=0.01)
clf_lasso = clf_lasso.fit(X_train,y_train)
In [880]:
min(cv_scores_mean)
Out[880]:
41.86244969089374
In [877]:
train_predictions = clf_lasso.predict(X_train)
test_predictions = clf_lasso.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [878]:
train_MSE
Out[878]:
41.74072519755045
In [879]:
test_MSE
Out[879]:
37.167262982433854

For a lasso regression model with a tuning parameter of 0.01, the training MSE was calculated to be 41.74, with a cross validation (on training data) of 41.86. The test MSE was calculated to be 37.167, which is less than all of the other linear regression models thus far, including Ridge Regression.

In [885]:
importance_sorted_idx = np.argsort(clf_lasso.coef_)
tree_indices = np.arange(0, len(clf_lasso.coef_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_lasso.coef_[importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_lasso.coef_)))
Out[885]:
(0, 30)

From this plot you can see certain parameters coefficients are pushed to zero by the lasso regression, including Wildcat, OL, defense score, shotgun, down and others. The highest coefficient is if the runner is a wide receiver and then rushers yards per game. The largest negative correlations are field position and defenders in the box.

In [836]:
backup = copy.deepcopy(regression_models_df)
In [886]:
regression_models_df = regression_models_df.append({'regression_models':'Lasso_Regression', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
In [887]:
regression_models_df
Out[887]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Bagging 41.048588 37.278613
4 Random Forest 41.105248 37.169823
5 Boosting 41.109360 37.085639
6 Multi_Linear_Regression 41.724492 37.179962
7 Forward_Stepwise_Subset 41.731454 37.199699
8 Backward_Stepwise_Subset 41.731454 37.199699
9 Ridge_Regression 41.728287 37.176765
10 Lasso_Regression 41.740725 37.167263

Partial Least Squares

In [889]:
pls = PLSRegression(n_components=10)
In [908]:
r2_values = []
In [892]:
regr = LinearRegression()
In [894]:
n = len(X_train)
In [895]:
# Null model (just intercept, no principal components in regression)
score = cross_val_score(regr, np.ones((n,1)), y_train, cv=5,scoring='r2').mean()
r2_values.append(score)
In [ ]:
 
In [911]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for i in np.arange(1,31):
    clf_pls = PLSRegression(n_components=i)
    cv_scores = cross_val_score(clf_pls, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_ridge.fit(X_train, y_train.values.ravel()).predict(X_train)))
d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\cross_decomposition\_pls.py:350: RuntimeWarning:

invalid value encountered in true_divide

In [915]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(np.arange(1,31), cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(np.arange(1,31), train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Partial Least Squares: MSE vs. Principal Components', fontsize=16)
ax.set_xlabel('Number of Principal Components', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Out[915]:
<matplotlib.legend.Legend at 0x1e98d985e08>
In [917]:
cv_scores_mean[3]
Out[917]:
41.88996094979501

From Cross-Validation, It appears 4 Principal Components is sufficient for the Partial Least Squares Analysis.

In [923]:
clf_pls = PLSRegression(n_components=4)
clf_pls = clf_pls.fit(X_train,y_train)
In [924]:
train_predictions = clf_pls.predict(X_train)
test_predictions = clf_pls.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
In [925]:
train_MSE
Out[925]:
41.73555598995453
In [926]:
test_MSE
Out[926]:
37.20702284960455

Using a Partial Least Squares Model with 4 components, the cross-validation (on the training data) MSE was calculated to be 41.9 and the full training MSE was 41.7. The test set MSE was 37.2, which is higher than all the other regression models thus far.

In [927]:
backup = copy.deepcopy(regression_models_df)
In [928]:
regression_models_df = regression_models_df.append({'regression_models':'Partial_Least_Squares', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},ignore_index=True)
In [1364]:
regression_models_df
Out[1364]:
regression_models regression_train_MSE regression_test_MSE
0 Null 42.770759 38.094389
1 Decision_Tree 41.315466 37.572015
2 Pruned_Tree 41.856000 37.421512
3 Cost_Complexity_Pruned_Tree 41.315466 37.572015
4 Bagging 41.048588 37.278613
5 Random Forest 41.105248 37.169823
6 Boosting 41.109360 37.085639
7 Multi_Linear_Regression 41.724492 37.179962
8 Forward_Stepwise_Subset 41.731454 37.199699
9 Backward_Stepwise_Subset 41.731454 37.199699
10 Ridge_Regression 41.728287 37.176765
11 Lasso_Regression 41.740725 37.167263
12 Partial_Least_Squares 41.735556 37.207023

Regression Model Comparison

We have now analyzed 11 different regressions (in addition to the null model) for determining rushing yards on a given NFL play based on 30 predictors. To compare all of these models against one another, the data was split into training and test data. Then each model was created from the training data, with the final model comparisons to be done using the test set Mean Squared Error. Clearly this data and regression is highly complex and tough to represent using linear models. This makes sense based on the diagnostic plots generated. None of the models particularly stand out from the null model, which makes sense based on the complexity and nature of the problem. However, certiain models did achieve better results than others, particularly Boosting. Even though it is a tree-based model, boosting showed the lowest test MSE compared to all of the other models, including ridge, lasso and Partial Least Squares among others. All of the models are better than the null model, which shows that these variables may be a good starting point for future more complex analysis.

Regression Model Conclusion

In this regression analysis we attempted to determine the number of yards a running play would generate based on numerous predictors, including the yards per game of the rusher, the position of the rusher, the offensive and defensive formations and where the ball is located on the field, among others. First, data was processed to get all of this information from a data set provided with information on every running play from 2017-2019. Collinearity was investigated, in which it was determined that several predictors should be taken out.

Not surprisingly after the diagnostics earlier, the models struggled to correctly determine the yards an NFL rushing play would generate based on 30 predictor variables. The data was right-skewed due to the seldom long runs that happen. Perhaps above all else this was a good learning experience in the importance of the diagnostics and assumptions of linear models. However, it did show certain interesting information such as the runner being a wide receiver, distance to go (from first down marker) and being the home team are positively correlated to the rushing yards gained on a play. Also, certain predictors show a negative correlation to rushing yards on a play, including field position, number of defenders in the box, number of Defensive Linemen and Linebackers, if the quarterback is the rusher and most surprisingly if the field is played at a retractable roof stadium with the roof open. Other small correlations were the field being outdorrs and if the play is going to the left show a slight positive orrelation to rushing yards.

Though the accuracy of these models may not be at the point where coaches and owners (or fantasy sports fans and gamblers) would trust the model to determine with great accuracy the amount of yards a rushing play would generate, it does provide certain insight into the factors that may lead to a rushing play gaining more yards.

Classification Problem

In this classification problem, I will use the same data set as above to determine whether a rushing play will gain a first down or not based on the predictor variables.

In [931]:
runners.head()
Out[931]:
GameId PlayId Team X Y S A Dis Orientation Dir NflId DisplayName JerseyNumber Season YardLine Quarter GameClock PossessionTeam Down Distance FieldPosition HomeScoreBeforePlay VisitorScoreBeforePlay NflIdRusher OffenseFormation OffensePersonnel DefendersInTheBox DefensePersonnel PlayDirection TimeHandoff TimeSnap Yards PlayerHeight PlayerWeight PlayerBirthDate PlayerCollegeName Position HomeTeamAbbr VisitorTeamAbbr Week Stadium Location StadiumType Turf GameWeather Temperature Humidity WindSpeed WindDirection sum_yards games ypg YardLine_refactor GameClock_timedata TimeElapsed DefenseTeam team_rushing_yards team_defense_rushing_yards PossessionScore DefenseScore ScoreDifferential OL TE WR RB Shotgun Wildcat DL LB DB PlayDirection_Left PlayerHeight_inches Age_days Runner_RB Runner_WR Runner_FB Runner_HB Runner_QB Outdoors Indoors Retractable_open Natural_Grass rainy_weather average_temperature average_humidity
0 2017090700 20170907000118 home 78.75 30.53 3.63 3.35 0.38 161.98 245.74 2543773 James White 28 2017 35 1 14:14:00 NE 3 2 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:06.000Z 2017-09-08T00:44:05.000Z 8 5-10 205 02/03/1992 Wisconsin RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 716.0 38.0 18.842105 35 00:14:14 46 KC 4820.0 4332.0 0 0 0 5 1 3 1 1 0 2 3 6 1 70.0 9349 1 0 0 0 0 1 0 0 0 0 NaN NaN
1 2017090700 20170907000139 home 71.07 27.16 3.06 2.41 0.34 210.70 312.20 2543773 James White 28 2017 43 1 13:52:00 NE 1 10 NE 0 0 2543773 SHOTGUN 1 RB, 1 TE, 3 WR 6 2 DL, 3 LB, 6 DB left 2017-09-08T00:44:27.000Z 2017-09-08T00:44:26.000Z 3 5-10 205 02/03/1992 Wisconsin RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 716.0 38.0 18.842105 43 00:13:52 68 KC 4820.0 4332.0 0 0 0 5 1 3 1 1 0 2 3 6 1 70.0 9349 1 0 0 0 0 1 0 0 0 0 NaN NaN
2 2017090700 20170907000189 home 48.66 19.11 5.77 2.42 0.60 140.82 221.96 2543773 James White 28 2017 35 1 13:02:00 NE 1 10 KC 0 0 2543773 SINGLEBACK 1 RB, 1 TE, 3 WR 7 2 DL, 3 LB, 6 DB left 2017-09-08T00:45:17.000Z 2017-09-08T00:45:15.000Z 5 5-10 205 02/03/1992 Wisconsin RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 716.0 38.0 18.842105 65 00:13:02 118 KC 4820.0 4332.0 0 0 0 5 1 3 1 0 0 2 3 6 1 70.0 9349 1 0 0 0 0 1 0 0 0 0 NaN NaN
3 2017090700 20170907000345 home 15.53 25.36 4.45 3.20 0.46 186.22 275.44 2539663 Mike Gillislee 35 2017 2 1 12:12:00 NE 2 2 KC 0 0 2539663 JUMBO 6 OL, 2 RB, 2 TE, 0 WR 9 4 DL, 4 LB, 3 DB left 2017-09-08T00:48:41.000Z 2017-09-08T00:48:39.000Z 2 5-11 210 11/01/1990 Florida RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 426.0 13.0 32.769231 98 00:12:12 168 KC 4820.0 4332.0 0 0 0 6 2 0 2 0 0 4 4 3 1 71.0 9808 1 0 0 0 0 1 0 0 0 0 NaN NaN
4 2017090700 20170907000395 away 29.99 27.12 3.90 2.53 0.44 34.27 157.92 2557917 Kareem Hunt 27 2017 25 1 12:08:00 KC 1 10 KC 7 0 2557917 SHOTGUN 1 RB, 3 TE, 1 WR 7 3 DL, 2 LB, 6 DB right 2017-09-08T00:53:14.000Z 2017-09-08T00:53:13.000Z 7 5-11 216 08/06/1995 Toledo RB NE KC 1 Gillette Stadium Foxborough, MA Outdoor Field Turf Clear and warm 63.0 77.0 8 SW 2207.0 30.0 73.566667 25 00:12:08 172 NE 3978.0 3566.0 0 7 -7 5 3 1 1 1 0 3 2 6 0 71.0 8069 1 0 0 0 0 1 0 0 0 0 NaN NaN

Create Classifier Response Variable

In [948]:
runners['FirstDown'] = list(runners.apply(lambda x: 1 if (x['Yards'] >= x['Distance']) else 0, axis=1))
In [949]:
runners['FirstDown'].value_counts()
Out[949]:
0    24415
1     6592
Name: FirstDown, dtype: int64

Predictors

The same predictors will be used in the classification model as in the regression model above.

In [950]:
y_train = runners[['FirstDown']].iloc[X_train.index]
y_test = runners[['FirstDown']].iloc[X_test.index]
In [951]:
y_train
Out[951]:
FirstDown
9619 0
907 0
3899 0
28347 1
30714 0
... ...
29802 0
5390 1
860 0
15795 0
23654 1

23255 rows × 1 columns

Classification Models

In [985]:
classification_models = []
classification_train_ER = []
classification_test_ER = []

Null Model

The null model is defined as the average rushing yards over the entire data set.

In [986]:
y_train.FirstDown.mode()[0]
Out[986]:
0
In [987]:
null_train_predictions = [y_train['FirstDown'].mode()[0]] * len(X_train.index)
null_test_predictions = [y_train['FirstDown'].mode()[0]] * len(X_test.index)
In [988]:
train_ER = 1 - accuracy_score(y_train,null_train_predictions)
test_ER = 1- accuracy_score(y_test, null_test_predictions)
classification_models.append('Null')
classification_train_ER.append(train_ER)
classification_test_ER.append(test_ER)
In [1005]:
train_ER
Out[1005]:
0.21410449365727802
In [1006]:
test_ER
Out[1006]:
0.20807533539731682
In [1018]:
print(classification_report(y_test,null_test_predictions))
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      6139
           1       0.00      0.00      0.00      1613

    accuracy                           0.79      7752
   macro avg       0.40      0.50      0.44      7752
weighted avg       0.63      0.79      0.70      7752

d:\wipdata\miniconda2\envs\adapt_validation\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

In [991]:
classification_models_df = pd.DataFrame({'classification_models': classification_models,'classification_train_ER': classification_train_ER, 'classification_test_ER':classification_test_ER})
In [992]:
classification_models_df
Out[992]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075

Decision Trees

Full Classification Tree

In [1009]:
classification_tree_clf = DecisionTreeClassifier(max_leaf_nodes=10)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
In [1010]:
train_predictions = classification_tree_clf.predict(X_train)
test_predictions = classification_tree_clf.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1011]:
train_ER
Out[1011]:
0.17579015265534292
In [1012]:
test_ER
Out[1012]:
0.17814757481940147
In [1013]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      6139
           1       0.62      0.37      0.46      1613

    accuracy                           0.82      7752
   macro avg       0.74      0.65      0.68      7752
weighted avg       0.80      0.82      0.80      7752

In [1014]:
backup = copy.deepcopy(classification_models_df)
In [1015]:
classification_models_df = classification_models_df.append({'classification_models': 'Decision_Tree','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1016]:
classification_models_df
Out[1016]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148

The Training set classification error rate is calculated to be 0.175 while the test error rate is calculated to be 0.178, these are both improvements over the null model.

In [1017]:
dot_data = tree.export_graphviz(classification_tree_clf, out_file=None,
                               feature_names=X_train.columns,
                               filled=True, rounded=True,
                               special_characters=True,
                               class_names=['No', 'Yes'])
graph = graphviz.Source(dot_data)
graph
Out[1017]:
Tree 0 Distance ≤ -1.0 gini = 0.337 samples = 23255 value = [18276, 4979] class = No 1 Distance ≤ -1.524 gini = 0.488 samples = 4468 value = [1886, 2582] class = Yes 0->1 True 2 Distance ≤ -0.476 gini = 0.223 samples = 18787 value = [16390, 2397] class = No 0->2 False 3 YardLine_refactor ≤ 1.829 gini = 0.451 samples = 2841 value = [975, 1866] class = Yes 1->3 4 DefendersInTheBox ≤ 0.585 gini = 0.493 samples = 1627 value = [911, 716] class = No 1->4 7 Distance ≤ -1.786 gini = 0.422 samples = 2187 value = [661, 1526] class = Yes 3->7 8 gini = 0.499 samples = 654 value = [314, 340] class = Yes 3->8 15 gini = 0.391 samples = 1472 value = [392, 1080] class = Yes 7->15 16 gini = 0.469 samples = 715 value = [269, 446] class = Yes 7->16 11 gini = 0.499 samples = 1190 value = [622, 568] class = No 4->11 12 gini = 0.448 samples = 437 value = [289, 148] class = No 4->12 5 Distance ≤ -0.738 gini = 0.389 samples = 1706 value = [1255, 451] class = No 2->5 6 Runner_WR ≤ 0.5 gini = 0.202 samples = 17081 value = [15135, 1946] class = No 2->6 17 gini = 0.431 samples = 922 value = [632, 290] class = No 5->17 18 gini = 0.326 samples = 784 value = [623, 161] class = No 5->18 9 Distance ≤ 0.571 gini = 0.196 samples = 16593 value = [14768, 1825] class = No 6->9 10 gini = 0.373 samples = 488 value = [367, 121] class = No 6->10 13 gini = 0.204 samples = 15286 value = [13520, 1766] class = No 9->13 14 gini = 0.086 samples = 1307 value = [1248, 59] class = No 9->14

From the Full decision tree above, a few important predictors can be noted. The first and second decisions are based on Distance, which makes since because Distance is the variable for distance to first down. So the less yards to the first down makes sense for it to be higher probability of converting a first down. Other important variables are Defenders in the box and field position (YardLine_refactor).

In [1019]:
tree_size = range(2,11)
In [1044]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for size in tree_size:
    tree_model = DecisionTreeClassifier(random_state=0, max_leaf_nodes=size)
    cv_scores = cross_val_score(tree_model, X_train, y_train.values.ravel(), cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1 - cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_ER_list.append(1 - accuracy_score(y_train,tree_model.fit(X_train, y_train.values.ravel()).predict(X_train)))
In [1045]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(tree_size, cv_scores_mean, '-o', label='CV Error Rate', alpha=0.9)
ax.plot(tree_size, train_ER_list, '-*', label='Train Error Rate', alpha=0.9)
ax.set_title('Accuracy vs. Terminal Nodes', fontsize=16)
ax.set_xlabel('Terminal Nodes', fontsize=14)
ax.set_ylabel('Classification Rate', fontsize=14)
ax.legend()
Out[1045]:
<matplotlib.legend.Legend at 0x1e98c837848>

From Cross Validation, the number of terminal nodes is selected to be 8.

In [1112]:
classification_tree_clf = DecisionTreeClassifier(max_leaf_nodes=8)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
In [1113]:
train_predictions = classification_tree_clf.predict(X_train)
test_predictions = classification_tree_clf.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1114]:
train_ER
Out[1114]:
0.17579015265534292
In [1115]:
test_ER
Out[1115]:
0.17814757481940147
In [1116]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      6139
           1       0.62      0.37      0.46      1613

    accuracy                           0.82      7752
   macro avg       0.74      0.65      0.68      7752
weighted avg       0.80      0.82      0.80      7752

In [1117]:
backup = copy.deepcopy(classification_models_df)
In [1118]:
classification_models_df = classification_models_df.append({'classification_models': 'Pruned_Tree','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1119]:
classification_models_df
Out[1119]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148

The Training set classification error rate is calculated to be 0.175 while the test error rate is calculated to be 0.178, these are both exactly the same as the full decision tree. This is due to the fact that even though the cross validation error rate was less for 8 terminal nodes, when rebuilding on the full training set, it is essentiall the exact same tree as the original full 10 terminal node decision tree. The gini coefficients and decisions are slightly different but the terminal nodes still have the same essentially the same data. Notice that the last splits on each branch all still hold the same classification on each node, so there is no change in the actual values obtained from evaluating the seperate models. See below:

In [1058]:
dot_data = tree.export_graphviz(classification_tree_clf, out_file=None,
                               feature_names=X_train.columns,
                               filled=True, rounded=True,
                               special_characters=True,
                               class_names=['No', 'Yes'])
graph = graphviz.Source(dot_data)
graph
Out[1058]:
Tree 0 Distance ≤ -1.0 gini = 0.337 samples = 23255 value = [18276, 4979] class = No 1 Distance ≤ -1.524 gini = 0.488 samples = 4468 value = [1886, 2582] class = Yes 0->1 True 2 Distance ≤ -0.476 gini = 0.223 samples = 18787 value = [16390, 2397] class = No 0->2 False 3 YardLine_refactor ≤ 1.829 gini = 0.451 samples = 2841 value = [975, 1866] class = Yes 1->3 4 DefendersInTheBox ≤ 0.585 gini = 0.493 samples = 1627 value = [911, 716] class = No 1->4 7 gini = 0.422 samples = 2187 value = [661, 1526] class = Yes 3->7 8 gini = 0.499 samples = 654 value = [314, 340] class = Yes 3->8 11 gini = 0.499 samples = 1190 value = [622, 568] class = No 4->11 12 gini = 0.448 samples = 437 value = [289, 148] class = No 4->12 5 gini = 0.389 samples = 1706 value = [1255, 451] class = No 2->5 6 Runner_WR ≤ 0.5 gini = 0.202 samples = 17081 value = [15135, 1946] class = No 2->6 9 Distance ≤ 0.571 gini = 0.196 samples = 16593 value = [14768, 1825] class = No 6->9 10 gini = 0.373 samples = 488 value = [367, 121] class = No 6->10 13 gini = 0.204 samples = 15286 value = [13520, 1766] class = No 9->13 14 gini = 0.086 samples = 1307 value = [1248, 59] class = No 9->14

Cost Complexity Tree Pruning

In [1059]:
classification_tree_clf = DecisionTreeClassifier(max_leaf_nodes=10)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
In [1061]:
path = classification_tree_clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
In [1062]:
ccp_alphas
Out[1062]:
array([0.        , 0.00043435, 0.00050007, 0.00051306, 0.00052835,
       0.00077603, 0.00137004, 0.00301882, 0.00417952, 0.06294624])
In [1066]:
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
Out[1066]:
Text(0.5, 1.0, 'Total Impurity vs effective alpha for training set')
In [1078]:
alphas = np.linspace(0,max(ccp_alphas[:-1]),20)
In [1079]:
alphas
Out[1079]:
array([0.        , 0.00021997, 0.00043995, 0.00065992, 0.0008799 ,
       0.00109987, 0.00131985, 0.00153982, 0.0017598 , 0.00197977,
       0.00219975, 0.00241972, 0.0026397 , 0.00285967, 0.00307965,
       0.00329962, 0.00351959, 0.00373957, 0.00395954, 0.00417952])
In [1081]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for alpha in alphas:
    tree_model = DecisionTreeClassifier(random_state=0, max_leaf_nodes=10,ccp_alpha=alpha)
    cv_scores = cross_val_score(tree_model, X_train, y_train.values.ravel(), cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1 - cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_ER_list.append(1 - accuracy_score(y_train,tree_model.fit(X_train, y_train.values.ravel()).predict(X_train)))
In [1083]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV Error Rate', alpha=0.9)
ax.plot(alphas, train_ER_list, '-*', label='Train Error Rate', alpha=0.9)
ax.set_title('Accuracy vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter alpha', fontsize=14)
ax.set_ylabel('Classification Rate', fontsize=14)
ax.legend()
Out[1083]:
<matplotlib.legend.Legend at 0x1e988b50fc8>
In [1084]:
alphas[3]
Out[1084]:
0.0006599240435130342

From the Cross validation on Tuning parameter, a tuning parameter of 0.00066 will be used for the Cost Complexity Pruning Classification Model.

In [1120]:
classification_tree_clf = DecisionTreeClassifier(ccp_alpha=0.00066)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
In [1121]:
train_predictions = classification_tree_clf.predict(X_train)
test_predictions = classification_tree_clf.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1122]:
train_ER
Out[1122]:
0.17579015265534292
In [1123]:
test_ER
Out[1123]:
0.17814757481940147
In [1124]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      6139
           1       0.62      0.37      0.46      1613

    accuracy                           0.82      7752
   macro avg       0.74      0.65      0.68      7752
weighted avg       0.80      0.82      0.80      7752

In [1125]:
backup = copy.deepcopy(classification_models_df)
In [1126]:
classification_models_df = classification_models_df.append({'classification_models': 'Cost_Complexity_Pruned_Tree','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1127]:
classification_models_df
Out[1127]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148

The Training set classification error rate is calculated to be 0.175 while the test error rate is calculated to be 0.178, these are both exactly the same as the full decision tree (and the previously pruned tree on number of terminal nodes). This is due to the fact that even though the cross validation error rate was less for a tuning parameter of 0.00066, when rebuilding on the full training set, it is essentiall the exact same tree as the original full 10 terminal node decision tree. The gini coefficients and decisions are slightly different but the terminal nodes still have the same essentially the same data. See below:

In [1128]:
dot_data = tree.export_graphviz(classification_tree_clf, out_file=None,
                               feature_names=X_train.columns,
                               filled=True, rounded=True,
                               special_characters=True,
                               class_names=['No', 'Yes'])
graph = graphviz.Source(dot_data)
graph
Out[1128]:
Tree 0 Distance ≤ -1.0 gini = 0.337 samples = 23255 value = [18276, 4979] class = No 1 Distance ≤ -1.524 gini = 0.488 samples = 4468 value = [1886, 2582] class = Yes 0->1 True 6 Distance ≤ -0.476 gini = 0.223 samples = 18787 value = [16390, 2397] class = No 0->6 False 2 YardLine_refactor ≤ 1.829 gini = 0.451 samples = 2841 value = [975, 1866] class = Yes 1->2 5 gini = 0.493 samples = 1627 value = [911, 716] class = No 1->5 3 gini = 0.422 samples = 2187 value = [661, 1526] class = Yes 2->3 4 gini = 0.499 samples = 654 value = [314, 340] class = Yes 2->4 7 gini = 0.389 samples = 1706 value = [1255, 451] class = No 6->7 8 Runner_WR ≤ 0.5 gini = 0.202 samples = 17081 value = [15135, 1946] class = No 6->8 9 gini = 0.196 samples = 16593 value = [14768, 1825] class = No 8->9 10 gini = 0.373 samples = 488 value = [367, 121] class = No 8->10

Bagging

In [1171]:
clf_bagging = RandomForestClassifier(max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = len(X_train.columns))
In [1172]:
clf_bagging = clf_bagging.fit(X_train,y_train.values.ravel())
In [1176]:
oob_ER = 1 - clf_bagging.oob_score_
In [1177]:
oob_ER
Out[1177]:
0.17226402924102346
In [1178]:
train_predictions = clf_bagging.predict(X_train)
test_predictions = clf_bagging.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1179]:
train_ER
Out[1179]:
0.1710599870995485
In [1180]:
test_ER
Out[1180]:
0.17427760577915374
In [1181]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.86      0.94      0.89      6139
           1       0.62      0.41      0.49      1613

    accuracy                           0.83      7752
   macro avg       0.74      0.67      0.69      7752
weighted avg       0.81      0.83      0.81      7752

In [1182]:
backup = copy.deepcopy(classification_models_df)
In [1183]:
classification_models_df = classification_models_df.append({'classification_models': 'Bagging','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1184]:
classification_models_df
Out[1184]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278

A bagging approach on the classification tree led to an out of bag error of 0.172 and a training error rate of 0.171. When this model was applied to the test set an error rate of 0.174 was obtained, which is smaller than the full decision tree and pruned trees.

In [1185]:
clf_bagging.feature_importances_
Out[1185]:
array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.46384163e-04,
       9.65066222e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.46202249e-03,
       2.04991454e-02, 7.57453403e-04, 1.07008633e-03, 1.98659556e-04,
       1.20290505e-03, 9.52451859e-01, 2.14830928e-04, 3.59195570e-04,
       0.00000000e+00, 0.00000000e+00, 7.41885645e-05, 3.34991267e-04,
       0.00000000e+00, 8.41352945e-03, 5.44217023e-04, 4.01494765e-04,
       7.04334524e-05, 1.14794139e-03])
In [1186]:
tree_importance_sorted_idx = np.argsort(clf_bagging.feature_importances_)
tree_indices = np.arange(0, len(clf_bagging.feature_importances_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_bagging.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_bagging.feature_importances_)))
ax1.set_title('Bagging Variable Importance')
Out[1186]:
Text(0.5, 1.0, 'Bagging Variable Importance')

The variable importance on the bagging model shows a very interesting result. Distance to first down is by far the most important predictor to determining if the rush will obtain a first down. Other slightly important variables are the field position (YardLine_refactor), if the runner is a wide receiver and the amount of defenders in the box.

Random Forest

In [1187]:
len(X_train.columns)
Out[1187]:
30
In [1188]:
np.sqrt(len(X_train.columns))
Out[1188]:
5.477225575051661
In [1189]:
num_predictors = range(1,len(X_train.columns)+1)
In [1190]:
oob_ER_list = []
train_ER_list = []

for num in num_predictors:
    clf_rf = RandomForestClassifier(max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = num)
    clf_rf = clf_rf.fit(X_train,y_train.values.ravel())

    oob_ER = 1 - clf_rf.oob_score_
    oob_ER_list.append(oob_ER)
    
    train_predictions = clf_rf.predict(X_train)
    train_ER = 1- accuracy_score(y_train,train_predictions)
    train_ER_list.append(train_ER)
    
In [1191]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(num_predictors, oob_ER_list, '-o', label='OOB ', alpha=0.9)
ax.plot(num_predictors, train_ER_list, '-*', label='train', alpha=0.9)
ax.set_title('Random Forest: Classification Error Rate vs. Number of Predictors', fontsize=16)
ax.set_xlabel('Number of Predictors', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.legend()
Out[1191]:
<matplotlib.legend.Legend at 0x1e98ed60dc8>

From this plot it appears the the out of bag classification error rate reaches a minimum at 15 predictors.

In [1192]:
number_trees = np.arange(25,525,25)
In [1194]:
oob_ER_list = []
train_ER_list = []

for num in number_trees:
    clf_rf = RandomForestClassifier(n_estimators=num, max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = 15)
    clf_rf = clf_rf.fit(X_train,y_train.values.ravel())

    oob_ER = 1 - clf_rf.oob_score_
    oob_ER_list.append(oob_ER)
    
    train_predictions = clf_rf.predict(X_train)
    train_ER = 1- accuracy_score(y_train,train_predictions)
    train_ER_list.append(train_ER)
    print(num)
    
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
In [1195]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(number_trees, oob_ER_list, '-o', label='OOB ', alpha=0.9)
ax.plot(number_trees, train_ER_list, '-*', label='train', alpha=0.9)
ax.set_title('Random Forest: Classification Error Rate vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.legend()
Out[1195]:
<matplotlib.legend.Legend at 0x1e990c30508>

From this plot it appears that the out of bag Classification Error Rate reaches a minimum around 100 Trees. A Random Forest with 15 predictors and 100 Trees will be used as the final Random Forest Model.

In [1367]:
clf_rf = RandomForestClassifier(n_estimators=100, max_leaf_nodes = 10, oob_score=True,
                                     random_state=0, max_features = 15)
In [1368]:
clf_rf = clf_rf.fit(X_train,y_train.values.ravel())
In [1200]:
oob_ER = 1 - clf_rf.oob_score_
In [1201]:
oob_ER
Out[1201]:
0.17200602021070732
In [1202]:
train_predictions = clf_rf.predict(X_train)
test_predictions = clf_rf.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1203]:
train_ER
Out[1203]:
0.17101698559449585
In [1204]:
test_ER
Out[1204]:
0.17479360165118674
In [1205]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      6139
           1       0.62      0.41      0.50      1613

    accuracy                           0.83      7752
   macro avg       0.74      0.67      0.70      7752
weighted avg       0.81      0.83      0.81      7752

In [1206]:
backup = copy.deepcopy(classification_models_df)
In [1207]:
classification_models_df = classification_models_df.append({'classification_models': 'Random Forest','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1208]:
classification_models_df
Out[1208]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794

The random forest model with 15 predictors and 100 trees had an out of bag classification error rate of 0.172 and a training set error rate of 0.171, slightly smaller than the Bagging Training set classification error rate. However, the test set classification error rate is 0.1748, slightly larger than the test set bagging classification error rate. This shows that there is a slightly higher variance in the random forest approach here than in the bagging approach.

In [1369]:
clf_rf.feature_importances_
Out[1369]:
array([0.00000000e+00, 1.04649998e-04, 0.00000000e+00, 7.44262206e-05,
       7.33930604e-03, 1.05162426e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 9.41937481e-05, 0.00000000e+00, 3.16041053e-03,
       2.85601673e-02, 1.22239926e-03, 8.63319983e-04, 6.88668385e-04,
       1.29337744e-01, 8.10649567e-01, 1.21926525e-03, 4.17672868e-04,
       1.93267279e-04, 1.56783511e-04, 5.46007458e-03, 6.57539789e-04,
       5.88919348e-05, 6.93680816e-03, 3.70222671e-04, 1.14790066e-03,
       3.98371203e-04, 7.83186788e-04])
In [1370]:
tree_importance_sorted_idx = np.argsort(clf_rf.feature_importances_)
tree_indices = np.arange(0, len(clf_rf.feature_importances_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_rf.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_rf.feature_importances_)))
ax1.set_title('Random Forest Classifier Variable Importance')
Out[1370]:
Text(0.5, 1.0, 'Random Forest Classifier Variable Importance')

Similar to the bagging classifier, the distance to first down predictor is by far the most important variable for determining if a rush will make a first down. However, unlike in the bagging classifier, down number is the second most important variable to determing if a run will make a first down. Other variables that show slight importance are field position, the binary value for if the rusher is a WR, the number of defenders in the box, the number of wide receivers and the rushers yards per game.

Linear Regression

Note that this approach is not advisable for classification. This is just a study to see the effects.

In [1209]:
clf_linear = LinearRegression()
clf_linear = clf_linear.fit(X_train,y_train)
In [1213]:
train_probabilities = clf_linear.predict(X_train)
test_probabilities = clf_linear.predict(X_test)
In [1218]:
train_predictions = np.where(train_probabilities > 0.5, 1, 0)
test_predictions = np.where(test_probabilities > 0.5, 1, 0)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1219]:
train_ER
Out[1219]:
0.17789722640292405
In [1220]:
test_ER
Out[1220]:
0.17711558307533537
In [1221]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1221]:
No Yes
No 17549 727
Yes 3410 1569
In [1222]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.84      0.96      0.90      6139
           1       0.66      0.31      0.42      1613

    accuracy                           0.82      7752
   macro avg       0.75      0.63      0.66      7752
weighted avg       0.80      0.82      0.80      7752

In [1223]:
backup = copy.deepcopy(classification_models_df)
In [1224]:
classification_models_df = classification_models_df.append({'classification_models': 'Linear_Regression','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1225]:
classification_models_df
Out[1225]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116

The linear regression approach (typically not advisable for classification) has a training set classification error rate of 0.178 and a test set classification error rate of 0.177. This test set classification error rate is higher than the error rates for bagging and random forest.

In [1371]:
clf_linear.coef_
Out[1371]:
array([[ 0.01124194,  0.00109001,  0.04399242,  0.0027565 ,  0.13285306,
         0.1005608 ,  0.00838987, -0.05678443,  0.00297359, -0.0278388 ,
        -0.02114311,  0.01179895, -0.00956719, -0.00257481,  0.01092201,
         0.00934385,  0.04289596, -0.15409164, -0.00482926,  0.00701215,
        -0.00385777, -0.01568355, -0.0207573 , -0.01008057, -0.01459392,
        -0.01733808, -0.00396055, -0.0057645 ,  0.00232537, -0.00202881]])
In [1372]:
importance_sorted_idx = np.argsort(clf_linear.coef_[0])
tree_indices = np.arange(0, len(clf_linear.coef_[0])) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_linear.coef_[0][importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
Out[1372]:
(0, 30)

From the coefficients for each of the predictors in the linear regression model, we find the following information for predicting whether a rush will reach a first down. If the rusher is a wide receiver or fullback the probability of the play gaining a first down is more likely. Also if the play comes from the Wilcat Formation it has a correlation with running plays getting a first down. Finally, as the down increases, the more likely it is that the rush will go for a fist down. However, Distance to go for first down is negatively correlated with whether the run will make a first down. other negative correlations to a run making a first down are if the rusher is a quarterback, if the game is in a retractable roof stadium with the roof open and if the wether is rainy.

Logistic Regression

In [1229]:
clf_logistic = LogisticRegression()
clf_logistic = clf_logistic.fit(X_train,y_train.values.ravel())
In [1230]:
train_probabilities = clf_logistic.predict(X_train)
test_probabilities = clf_logistic.predict(X_test)
In [1231]:
train_predictions = np.where(train_probabilities > 0.5, 1, 0)
test_predictions = np.where(test_probabilities > 0.5, 1, 0)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1232]:
train_ER
Out[1232]:
0.17269404429155022
In [1233]:
test_ER
Out[1233]:
0.1750515995872033
In [1234]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1234]:
No Yes
No 17216 1060
Yes 2956 2023
In [1235]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      6139
           1       0.62      0.41      0.49      1613

    accuracy                           0.82      7752
   macro avg       0.74      0.67      0.69      7752
weighted avg       0.81      0.82      0.81      7752

In [1236]:
backup = copy.deepcopy(classification_models_df)
In [1239]:
classification_models_df = classification_models_df.append({'classification_models': 'Logistic_Regression','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1240]:
classification_models_df
Out[1240]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116
7 Logistic_Regression 0.172694 0.175052

The logistic regression approach gave a training error rate of 0.172 and a test error rate of 0.175. These were both better than the linear regression approach but worse than bagging and random forest results.

In [1373]:
clf_logistic.coef_
Out[1373]:
array([[ 0.0794587 , -0.03140985,  0.19199125,  0.03699359,  0.94521331,
         0.54423935,  0.065376  , -0.37820901,  0.0124539 , -0.21048324,
        -0.14613935,  0.08769033, -0.10106874, -0.01023267,  0.07619419,
         0.07864469,  0.00246348, -1.22991533, -0.01009433,  0.03275058,
        -0.03502216, -0.11093187, -0.14925183, -0.05894504, -0.08739867,
        -0.1654048 , -0.02854282, -0.04230991,  0.01473512, -0.0294868 ]])
In [1374]:
importance_sorted_idx = np.argsort(clf_logistic.coef_[0])
tree_indices = np.arange(0, len(clf_logistic.coef_[0])) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_logistic.coef_[0][importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
Out[1374]:
(0, 30)

The coefficients for the logistic regression model follow similar trends to the coefficients in the linear regression for the most part. f the rusher is a wide receiver or fullback the probability of the play gaining a first down is more likely. Also if the play comes from the Wilcat Formation it has a correlation with running plays getting a first down. Finally, the rushers yards per game has a positive correlation to whether the run will reach a first down. However, Distance to go for first down is negatively correlated with whether the run will make a first down. other negative correlations to a run making a first down are if the rusher is a quarterback, if the game is in a retractable roof stadium with the roof open and if the wether is rainy. The number of defenders in the box has a negative correlation with the first down binary value, meaning that as there are more players in the box on defense the less likely it will be that the run will make a first down.

Linear Discriminant Analysis

In [1242]:
clf_lda = LinearDiscriminantAnalysis()
In [1243]:
clf_lda = clf_lda.fit(X_train,y_train.values.ravel())
In [1244]:
train_predictions = clf_lda.predict(X_train)
test_predictions = clf_lda.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1245]:
train_ER
Out[1245]:
0.17333906686734035
In [1246]:
test_ER
Out[1246]:
0.17234262125902988
In [1247]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1247]:
No Yes
No 17107 1169
Yes 2862 2117
In [1248]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      6139
           1       0.63      0.43      0.51      1613

    accuracy                           0.83      7752
   macro avg       0.74      0.68      0.70      7752
weighted avg       0.81      0.83      0.81      7752

In [1249]:
backup = copy.deepcopy(classification_models_df)
In [1250]:
classification_models_df = classification_models_df.append({'classification_models': 'Linear_Discriminant_Analysis','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1251]:
classification_models_df
Out[1251]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116
7 Logistic_Regression 0.172694 0.175052
8 Linear_Discriminant_Analysis 0.173339 0.172343

The linear discriminant analysis approach gave a training set error rate of 0.173 and a test set error rate of 0.172. This test error rate is the lowest error rate obtained for the classification model thus far.

In [1375]:
importance_sorted_idx = np.argsort(clf_lda.coef_[0])
tree_indices = np.arange(0, len(clf_lda.coef_[0])) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_lda.coef_[0][importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
Out[1375]:
(0, 30)

The coefficients for LDA show similar trends to the logistic regression.

Quadratic Discriminant Analysis

In [1264]:
clf_qda = QuadraticDiscriminantAnalysis()
In [1265]:
clf_qda = clf_qda.fit(X_train,y_train.values.ravel())
In [1266]:
train_predictions = clf_qda.predict(X_train)
test_predictions = clf_qda.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1267]:
train_ER
Out[1267]:
0.2032681143840035
In [1268]:
test_ER
Out[1268]:
0.2053663570691434
In [1269]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1269]:
No Yes
No 16956 1320
Yes 3407 1572
In [1270]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      6139
           1       0.51      0.29      0.37      1613

    accuracy                           0.79      7752
   macro avg       0.67      0.61      0.62      7752
weighted avg       0.77      0.79      0.77      7752

In [1271]:
backup = copy.deepcopy(classification_models_df)
In [1272]:
classification_models_df = classification_models_df.append({'classification_models': 'Quadratic_Discriminant_Analysis','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1273]:
classification_models_df
Out[1273]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116
7 Logistic_Regression 0.172694 0.175052
8 Linear_Discriminant_Analysis 0.173339 0.172343
9 Quadratic_Discriminant_Analysis 0.203268 0.205366

The quadratic discriminant analysis approach gave a training set error rate of 0.203 and a test set error rate of 0.205. This test error rate is significantly higher than all of the other models tested and only slightly better than the null model. Perhaps assuming that each class has its own covariance matri is a bad assumption, because LDA, with its common covariance matrix assumption, has lower classification error rate. Also, since QDA is more flexible perhaps it is allowing too much flexibility in the model, although this does not show up as overfitting issue because the training error rate is also higher than the others.

Shrinkage Methods

Ridge Regression

In [1276]:
reg = RidgeClassifierCV(alphas=np.logspace(-6, 6, 13),
                          cv=5)
ridge_model = reg.fit(X_train,y_train.values.ravel())
ridge_model.alpha_
Out[1276]:
1e-06
In [1278]:
lamda_cv = ridge_model.alpha_
lamda_cv
Out[1278]:
1e-06
In [1279]:
alphas=np.logspace(-6, 6, 13)
In [1284]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for alpha in alphas:
    clf_ridge = RidgeClassifier(alpha=alpha)
    cv_scores = cross_val_score(clf_ridge, X_train, y_train.values.ravel(), cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1- cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_ER_list.append(1- accuracy_score(y_train, clf_ridge.fit(X_train, y_train.values.ravel()).predict(X_train)))
In [1287]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV ', alpha=0.9)
ax.plot(alphas, train_ER_list, '-*', label='train', alpha=0.9)
ax.set_title('Ridge Classification: Classification Error Rate vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('tuning Parameter', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.set_xscale('log')
ax.legend()
Out[1287]:
<matplotlib.legend.Legend at 0x1e992674848>

From the above plot, a tuning parameter of 1e-6 will be used in the Ridge Classification Model

In [1289]:
clf_ridge = RidgeClassifier(alpha=lamda_cv)
In [1290]:
clf_ridge = clf_ridge.fit(X_train,y_train.values.ravel())
In [1291]:
train_predictions = clf_ridge.predict(X_train)
test_predictions = clf_ridge.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1292]:
train_ER
Out[1292]:
0.17789722640292405
In [1293]:
test_ER
Out[1293]:
0.17711558307533537
In [1294]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1294]:
No Yes
No 17549 727
Yes 3410 1569
In [1295]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.84      0.96      0.90      6139
           1       0.66      0.31      0.42      1613

    accuracy                           0.82      7752
   macro avg       0.75      0.63      0.66      7752
weighted avg       0.80      0.82      0.80      7752

In [1300]:
backup = copy.deepcopy(classification_models_df)
In [1301]:
classification_models_df = classification_models_df.append({'classification_models': 'Ridge_Classification','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1302]:
classification_models_df
Out[1302]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116
7 Logistic_Regression 0.172694 0.175052
8 Linear_Discriminant_Analysis 0.173339 0.172343
9 Quadratic_Discriminant_Analysis 0.203268 0.205366
10 Ridge_Classification 0.177897 0.177116

The ridge classification gave essentially the same results and model as the linear regression model. The test error rate of 0.177 is higher than most other models except QDA.

In [1378]:
importance_sorted_idx = np.argsort(clf_ridge.coef_[0])
tree_indices = np.arange(0, len(clf_ridge.coef_[0])) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_ridge.coef_[0][importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
Out[1378]:
(0, 30)

K Nearest Neighbors

K Nearest neighbors is a non-parametric model that will be used for classification. Cross-validation on the training set will be used to determine the number of neighbors for the final model.

In [1325]:
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for neighbors in number_neighbors:
    clf_knn = KNeighborsClassifier(n_neighbors=neighbors)
    cv_scores = cross_val_score(clf_knn, X_train, y_train.values.ravel(), cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1- cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_ER_list.append(1- accuracy_score(y_train, clf_knn.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(neighbors)
5
30
55
80
In [1329]:
number_neighbors2 = np.arange(35,55,5)
In [1330]:
cv_scores_list2 = []
cv_scores_std2 = []
cv_scores_mean2 = []
train_ER_list2 = []
for neighbors in number_neighbors2:
    clf_knn = KNeighborsClassifier(n_neighbors=neighbors)
    cv_scores = cross_val_score(clf_knn, X_train, y_train.values.ravel(), cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean2.append(1- cv_scores.mean())
    cv_scores_std2.append(cv_scores.std())
    train_ER_list2.append(1- accuracy_score(y_train, clf_knn.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(neighbors)
35
40
45
50
In [1339]:
k_neighbors = list(number_neighbors[:2]) + list(number_neighbors2) + list(number_neighbors[2:])
training_k_neighbors = list(train_ER_list[:2]) + list(train_ER_list2) + list(train_ER_list[2:])
cv_scores_mean_knn = list(cv_scores_mean[:2]) + list(cv_scores_mean2) + list(cv_scores_mean[2:])
In [1341]:
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(k_neighbors, cv_scores_mean_knn, '-o', label='CV ', alpha=0.9)
ax.plot(k_neighbors, training_k_neighbors, '-*', label='train', alpha=0.9)
ax.set_title('KNN Classification: Classification Error Rate vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Neighbors', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.legend()
Out[1341]:
<matplotlib.legend.Legend at 0x1e991248ac8>

From the plot above, it appears the classification error rate reaches a minimum at n=35 neighbors. This will be used for the final knn model.

In [1342]:
clf_knn = KNeighborsClassifier(n_neighbors=35)
In [1343]:
clf_knn = clf_knn.fit(X_train,y_train.values.ravel())
In [1344]:
train_predictions = clf_knn.predict(X_train)
test_predictions = clf_knn.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1345]:
train_ER
Out[1345]:
0.1834874220597721
In [1346]:
test_ER
Out[1346]:
0.18988648090815274
In [1347]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1347]:
No Yes
No 17765 511
Yes 3756 1223
In [ ]:
 
In [1348]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      6139
           1       0.63      0.21      0.32      1613

    accuracy                           0.81      7752
   macro avg       0.73      0.59      0.60      7752
weighted avg       0.78      0.81      0.77      7752

In [1349]:
backup = copy.deepcopy(classification_models_df)
In [1350]:
classification_models_df = classification_models_df.append({'classification_models': 'KNN_Classification','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1351]:
classification_models_df
Out[1351]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116
7 Logistic_Regression 0.172694 0.175052
8 Linear_Discriminant_Analysis 0.173339 0.172343
9 Quadratic_Discriminant_Analysis 0.203268 0.205366
10 Ridge_Classification 0.177897 0.177116
11 KNN_Classification 0.183487 0.189886

The KNN classification with 35 neighbors gave a training set error rate of 0.183 and a test set error rate of 0.190. This is higher than all models other than the QDA and the Null Model.

Naive Bayes

Gaussian Naive Bayes

In [1352]:
clf_gnb = GaussianNB()
In [1353]:
clf_gnb = clf_gnb.fit(X_train,y_train.values.ravel())
In [1354]:
train_predictions = clf_gnb.predict(X_train)
test_predictions = clf_gnb.predict(X_test)
train_ER = 1 - accuracy_score(y_train,train_predictions)
test_ER = 1- accuracy_score(y_test, test_predictions)
In [1355]:
train_ER
Out[1355]:
0.20847129649537732
In [1356]:
test_ER
Out[1356]:
0.211687306501548
In [1357]:
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No','Yes'], columns = ['No','Yes'])
df_cm
Out[1357]:
No Yes
No 16607 1669
Yes 3179 1800
In [1358]:
print(classification_report(y_test,test_predictions))
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      6139
           1       0.49      0.33      0.40      1613

    accuracy                           0.79      7752
   macro avg       0.66      0.62      0.63      7752
weighted avg       0.77      0.79      0.77      7752

In [1359]:
backup = copy.deepcopy(classification_models_df)
In [1360]:
classification_models_df = classification_models_df.append({'classification_models': 'Gaussian_Naive_Bayes','classification_train_ER': train_ER, 'classification_test_ER':test_ER},ignore_index=True)
In [1361]:
classification_models_df
Out[1361]:
classification_models classification_train_ER classification_test_ER
0 Null 0.214104 0.208075
1 Decision_Tree 0.175790 0.178148
2 Pruned_Tree 0.175790 0.178148
3 Cost_Complexity_Pruned_Tree 0.175790 0.178148
4 Bagging 0.171060 0.174278
5 Random Forest 0.171017 0.174794
6 Linear_Regression 0.177897 0.177116
7 Logistic_Regression 0.172694 0.175052
8 Linear_Discriminant_Analysis 0.173339 0.172343
9 Quadratic_Discriminant_Analysis 0.203268 0.205366
10 Ridge_Classification 0.177897 0.177116
11 KNN_Classification 0.183487 0.189886
12 Gaussian_Naive_Bayes 0.208471 0.211687

The Gaussian Naive Bayes had a training error rate of 0.208 and test error rate of 0.212. This test error rate is higher than all models, including the Null Model.

Classification Model Comparison

After analyzing 12 different classification models (in addition to the null model) for determining whether a rush will

Classification Model Conclusion

In [ ]: